In [49]:
from pathlib import Path

import pandas as pd

from feature_extractions_helper import *
from preprocessing_helper import *

In [42]:
# Lower-case slang acronyms / abbreviations counted by `num_slang_acronym`.
SLANG = {
    'smh', 'fwb', 'lmfao', 'lmao', 'lms', 'tbh', 'rofl', 'wtf', 'bff',
    'wyd', 'lylc', 'brb', 'atm', 'imao', 'sml', 'btw', 'bw', 'imho', 'fyi',
    'ppl', 'sob', 'ttyl', 'imo', 'ltr', 'thx', 'kk', 'omg', 'omfg', 'ttys',
    'afn', 'bbs', 'cya', 'ez', 'f2f', 'gtr', 'ic', 'jk', 'k', 'ly', 'ya',
    'nm', 'np', 'plz', 'ru', 'so', 'tc', 'tmi', 'ym', 'ur', 'u', 'sol', 'fml'}

# Compiled once here rather than on every call (the function is applied to
# every row of the data frame). Entries are escaped so that an entry holding
# a regex metacharacter would still be matched literally, and sorted so the
# pattern text does not depend on set iteration order.
# NOTE: re-run this cell after editing SLANG so the pattern is rebuilt.
_SLANG_PATTERN = re.compile(
    r"\b(?:{})\b".format("|".join(re.escape(term) for term in sorted(SLANG)))
)


def num_slang_acronym(text):
    '''Count the slang acronyms in a text.

    The text is lower-cased before matching, so the count is
    case-insensitive. An entry of SLANG is only counted when it is delimited
    by regex word boundaries, i.e. "so" is not counted inside "soul".

    Parameters
    ----------
    text : str
        The text to scan.

    Returns
    -------
    int
        Number of non-overlapping SLANG matches found in the text.
    '''
    return len(_SLANG_PATTERN.findall(text.lower()))

In [51]:
# Punctuation characters stripped from both ends of every token.
PUNCTUATION_LIST = list(string.punctuation)

# Cache for the English stop words; filled lazily on first use so the word
# list is read once instead of once per matching token.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return `stopwords.words("english")` as a frozenset, loading it once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def get_avg_slang_sent(text):
    """Calculate the average slang sentiment per token of a text.

    The text is split on single spaces and each token has leading/trailing
    punctuation removed. For every token that is present in
    ``slang_df.index`` and is not an English stop word, the ``sentiment``
    value stored in ``slang_df`` is added to a running sum. The sum is then
    divided by the total number of tokens, including tokens without a slang
    entry and the empty tokens produced by consecutive spaces.

    Tokens are looked up as-is (no lower-casing).
    NOTE(review): this assumes the casing of ``slang_df.index`` matches the
    input text -- confirm against how ``slang_df`` is built.

    Parameters
    ----------
    text : str
        The text to score.

    Returns
    -------
    Sum of the matched sentiment values divided by the token count.
    """
    strip_chars = "".join(PUNCTUATION_LIST)  # built once, not once per token
    tokens = [token.strip(strip_chars) for token in text.split(" ")]

    slang_sent = 0
    for word in tokens:
        # The stop-word set is only consulted (and loaded) for slang hits.
        if word in slang_df.index and word not in _english_stopwords():
            slang_sent += slang_df.loc[word, "sentiment"]

    # Average over all tokens rather than returning the raw sum.
    return slang_sent / len(tokens)

In [6]:
# Read the raw, not-yet-annotated data set and show its (rows, columns).
raw_csv_path = "data/csv/raw_not_annotated.csv"
df = pd.read_csv(raw_csv_path)
df.shape

(34436, 19)

In [10]:
# Derive the cleaned text column from `full_text`:
# run `preprocessing` first, then `lemmatize` on its result.
df["processed_text"] = df["full_text"].apply(preprocessing).apply(lemmatize)
# Emoji-to-text conversion is currently switched off:
# df["processed_text"] = df["processed_text"].apply(convert_emoji_to_text)

In [12]:
# Rows sharing the same processed text are collapsed to one; of each group
# of duplicates the last row is the one that is kept.
df = df.drop_duplicates(subset=["processed_text"], keep="last")

print(df.shape)

(30124, 20)


In [21]:
## Run every feature extractor over the `full_text` column.

# Output column -> extractor function, in the order the columns are added.
feature_extractors = {
    "past_tense_count": count_past_tense,
    "first_preson_pron_count": count_first_person_pro,
    "second_person_pron_count": count_second_person_pro,
    "third_person_pron_count": count_third_person_pro,
    "coord_conj_count": count_coord_conj,
    "future_tense_count": count_future_tense,
    "commas_count": count_commas,
    "multi_punc_count": count_multi_punc,
    "cap_words_count": count_cap_words,
    "slang_acronym_count": num_slang_acronym,
    "sentence_count": num_of_sent,
    "avg_pos_words": get_avg_pos_words,
    "avg_neg_words": get_avg_neg_words,
    "avg_slang_sent": get_avg_slang_sent,
    "avg_len_sent": avg_len_sent,
    "avg_len_tokens": avg_len_tokens,
    "tags_count": count_tags,
}

full_text = df["full_text"]
for column_name, extractor in feature_extractors.items():
    df[column_name] = full_text.apply(extractor)

# Spread each `tags_count` entry over four dedicated count columns
# (assumes `count_tags` yields four values per row, in this order).
tag_columns = ['common_noun_count', 'proper_noun_count', 'adv_count', 'wh_count']
df[tag_columns] = pd.DataFrame(df['tags_count'].tolist(), index=df.index)

print(df.shape)

In [92]:
# Columns carried over into the exported data set, in output order.
selected_columns = [
    'processed_text',
    'retweet_count',
    'favorite_count',
    'hashtags_count',
    'past_tense_count',
    'first_preson_pron_count',
    'second_person_pron_count',
    'third_person_pron_count',
    'coord_conj_count',
    'future_tense_count',
    'commas_count',
    'multi_punc_count',
    'cap_words_count',
    'slang_acronym_count',
    'sentence_count',
    'avg_pos_words',
    'avg_neg_words',
    'avg_slang_sent',
    'avg_len_sent',
    'avg_len_tokens',
    'common_noun_count',
    'proper_noun_count',
    'adv_count',
    'wh_count',
]
df_selected = df[selected_columns]

In [95]:
# Preview the first five rows of the selected feature table.
df_selected.head(n=5)

Unnamed: 0,processed_text,retweet_count,favorite_count,hashtags_count,past_tense_count,first_preson_pron_count,second_person_pron_count,third_person_pron_count,coord_conj_count,future_tense_count,...,sentence_count,avg_pos_words,avg_neg_words,avg_slang_sent,avg_len_sent,avg_len_tokens,common_noun_count,proper_noun_count,adv_count,wh_count
0,today we add an important chapter to a europea...,1557,5840,2,0,1,0,1,0,2,...,2,0.086957,0.0,-0.025,20.0,4.85,9,3,0,0
1,sedgwick county ha receive it first allocation...,9,50,1,1,1,0,1,2,0,...,2,0.0,0.0,0.0,19.0,5.421053,8,6,0,1
3,i receive my covid vaccination today it wa gre...,139,1008,1,2,3,0,3,1,0,...,3,0.083333,0.0,0.0,14.333333,4.627907,10,1,2,0
4,gen gu perna on the distribution of the modern...,325,1784,1,0,0,0,0,0,0,...,4,0.0,0.0,0.0,3.75,4.642857,2,5,0,0
5,we have to trust the science and our health ca...,55,550,1,0,3,0,1,3,0,...,2,0.114286,0.0,0.0,13.5,4.344828,7,1,0,0


In [96]:
# Export the selected features. `to_csv` is called with its default
# `index=True`, so the frame's index is written as the first, unnamed column.
output_path = Path("data/processed/processed_data_30124.csv")
# Make sure the target folder exists: `to_csv` does not create missing
# directories, and failing here would lose the whole run at the last step.
output_path.parent.mkdir(parents=True, exist_ok=True)
df_selected.to_csv(output_path)