### Import

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold
import pickle
import re
import numpy as np
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from ordered_set import OrderedSet


In [16]:
# Load the tweet table; its 'cleaned_text' column is the corpus vectorized below.
cleaned_df = pd.read_csv(filepath_or_buffer='resources/cleaned_tweets.csv')

In [17]:
# Lazily-filled cache so the stemmer and stop-word set are built once per
# kernel instead of once per document.
_CLEAN_TEXT_RESOURCES = {}


def _get_clean_text_resources():
    """Return the shared ``(stemmer, stop_words)`` pair, creating it on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        # NLTK corpus file ids are lowercase; 'English' only resolves on
        # case-insensitive filesystems and fails elsewhere.
        stop_words = frozenset(stopwords.words('english'))
        # Populate in one step so a failed corpus load leaves the cache empty.
        _CLEAN_TEXT_RESOURCES.update(stemmer=PorterStemmer(), stop_words=stop_words)
    return _CLEAN_TEXT_RESOURCES['stemmer'], _CLEAN_TEXT_RESOURCES['stop_words']


def clean_text(text):
    """Normalize one document into a space-joined string of stemmed tokens.

    Steps, in order:
      1. Replace every character outside ``a-zA-Z`` with a space, collapse
         whitespace runs, lowercase and strip.
      2. Tokenize with ``word_tokenize``.
      3. Keep each distinct token once (first-occurrence order), dropping
         English stop words and tokens of two characters or fewer.
      4. Porter-stem the survivors and join them with single spaces.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text; an empty string when no token survives.

    Notes
    -----
    NOTE(review): step 3 de-duplicates tokens before stemming, so a word
    repeated within one document reaches the vectorizer only once and its
    term frequency is flattened. Confirm this is intended for TF-IDF.
    """
    stemmer, stop_words = _get_clean_text_resources()

    letters_only = re.sub(r'[^a-zA-Z]', ' ', text)
    normalized = re.sub(r'\s+', ' ', letters_only).lower().strip()
    tokens = word_tokenize(normalized)

    # dict.fromkeys gives an order-preserving de-duplication.
    kept = [
        token
        for token in dict.fromkeys(tokens)
        if token not in stop_words and len(token) > 2
    ]
    return ' '.join(stemmer.stem(token) for token in kept)

### TF-IDF Vectorizer

In [18]:
# Learn the vocabulary / IDF weights and vectorize the corpus in a single pass.
# fit_transform is equivalent to fit() followed by transform(), but it runs the
# slow clean_text preprocessor over every document once instead of twice.
# NOTE(review): the saved outputs further down list bigram columns such as
# 'dumb ass', which ngram_range=(1,1) cannot produce. Re-run from a fresh kernel
# to confirm the persisted artifacts match this setting.
tf_idf_vectorizer = TfidfVectorizer(ngram_range=(1,1), preprocessor=clean_text)
tf_idf_matrix = tf_idf_vectorizer.fit_transform(cleaned_df['cleaned_text'])

# Materialize a dense frame with one column per vocabulary term. This allocates
# n_documents x n_terms floats; pd.DataFrame.sparse.from_spmatrix(tf_idf_matrix, ...)
# is the memory-friendly alternative, but the later cells that consume
# tf_idf_df (to_parquet, VarianceThreshold) would need re-checking against it.
tf_idf_df = pd.DataFrame(tf_idf_matrix.toarray(),columns=tf_idf_vectorizer.get_feature_names_out())
tf_idf_df

Unnamed: 0,aaa,aaaa,aaaaa,aaaaaaaaaa,aaaaaaaaaaaaaaaaaaaaaah,aaaaaaaaaah,aaaaaaaaaajajajajajajajahahahajahaja,aaaaaaaahhhhhhh,aaaaah,aaaaargh,...,zyrrlt,zython,zyvmzm,zyvmzmeea,zyxeq,zzoegrimm,zzsuo,zzz,zzzz,zzzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Smoke test: vectorize one ad-hoc sentence with the fitted vectorizer and show
# it against the learned vocabulary.
pd.DataFrame(
    data=tf_idf_vectorizer.transform(['test the thingy']).toarray(),
    columns=tf_idf_vectorizer.get_feature_names_out(),
)

Unnamed: 0,aa,aaa,aaaa,aaaaa,aaaaaaaaaa,aaaaaaaaaaaaaaaaaaaaaah,aaaaaaaaaah,aaaaaaaaaajajajajajajajahahahajahaja,aaaaaaaahhhhhhh,aaaaah,...,zyrrlt,zython,zyvmzme,zyvmzmeea,zyxeq,zzoegrimm,zzsuo,zzz,zzzz,zzzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Persist the full TF-IDF feature frame as gzip-compressed parquet, without the
# row index.
tf_idf_df.to_parquet(
    'resources/tf_idf.parquet.gzip',
    index=False,
    compression='gzip',
)


In [20]:
# Persist the fitted vectorizer. The context manager guarantees the file is
# flushed and closed (the bare open() previously leaked the handle).
# The vectorizer holds a reference to clean_text, and pickle stores functions by
# qualified name only, so whatever loads this file must have clean_text
# available under the same module path.
with open('resources/tfidf_vect.pkl', 'wb') as vectorizer_file:
    pickle.dump(tf_idf_vectorizer, vectorizer_file)

### Variance Threshold

In [7]:
# Fit a variance filter on the TF-IDF frame, then keep only the columns whose
# variance exceeds the 0.0005 threshold.
var_thr = VarianceThreshold(threshold=0.0005).fit(tf_idf_df)
selected_features = var_thr.get_feature_names_out()
var_df = tf_idf_df[selected_features]

In [8]:
# Report how many vocabulary terms the variance filter discarded.
n_features_before = len(tf_idf_vectorizer.get_feature_names_out())
n_features_after = len(var_thr.get_feature_names_out())
print(n_features_before - n_features_after, 'features removed.')

377797 features removed.


In [9]:
# Display the reduced feature frame (the columns selected by var_thr).
var_df

Unnamed: 0,ass,ass nigger,bitch,bulli,bulli high,call,dumb,dumb ass,dumb nigger,fuck,...,obama dumb,one,peopl,rape,say,school,school bulli,tayyoung,tayyoung fuck,think
0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47233,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47234,0.0,0.0,0.000000,0.0,0.0,0.052413,0.000000,0.0,0.000000,0.000000,...,0.0,0.052672,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47235,0.0,0.0,0.104151,0.0,0.0,0.000000,0.083660,0.0,0.120870,0.076311,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47236,0.0,0.0,0.000000,0.0,0.0,0.000000,0.100473,0.0,0.145161,0.183295,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Persist the variance-filtered features as CSV, without the row index.
var_df.to_csv('resources/var_tf_idf.csv', encoding='utf-8', index=False)

# Persist the fitted selector. The context manager guarantees the file is
# flushed and closed (the bare open() previously leaked the handle).
with open('resources/var_thr.pkl', 'wb') as selector_file:
    pickle.dump(var_thr, selector_file)