In [16]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
!pip install gensim
from gensim.models import Word2Vec



In [17]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [18]:
df = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    encoding="latin-1",
    header=None,
    on_bad_lines='skip',
    engine='python'
)

In [19]:
df.columns = ['target', 'id', 'date', 'flag', 'user', 'text']

In [20]:
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [44]:
df['flag'].value_counts()

Unnamed: 0_level_0,count
flag,Unnamed: 1_level_1
NO_QUERY,309086


In [45]:
df.shape

(309086, 6)

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309086 entries, 0 to 309085
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   target  309086 non-null  int64 
 1   id      309086 non-null  int64 
 2   date    309086 non-null  object
 3   flag    309086 non-null  object
 4   user    309086 non-null  object
 5   text    309086 non-null  object
dtypes: int64(2), object(4)
memory usage: 14.1+ MB


In [47]:
df.sample(5)

Unnamed: 0,target,id,date,flag,user,text
284352,0,1993200820,Mon Jun 01 09:59:46 PDT 2009,NO_QUERY,jovo,my gmail spam filter seems to have stopped wor...
53598,0,1680341269,Sat May 02 11:27:31 PDT 2009,NO_QUERY,callmejersey,Never wanted to go home this bad in my entire ...
69472,0,1693297290,Sun May 03 22:07:16 PDT 2009,NO_QUERY,mrdavenport,@x_silentchaos hey im really sorry but my comp...
194623,0,1970327632,Sat May 30 03:24:35 PDT 2009,NO_QUERY,Kitt69,@marcthom Oh yes!!! I went to the one in Lond...
167044,0,1961529336,Fri May 29 09:38:36 PDT 2009,NO_QUERY,MeredithRae,back to work


In [48]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)
    text = re.sub(r"[^a-z\s]", "", text)

    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return " ".join(tokens)


In [49]:
df_small = df.sample(20000, random_state=42)
df_small['clean_text'] = df_small['text'].apply(clean_text)

In [50]:
df_small[['text', 'clean_text']].head()

Unnamed: 0,text,clean_text
285983,plan for tonight I think? veg out in front of ...,plan tonight think veg front tv early night fe...
148567,@BoyNamedDavid I want to but Rockky and Dale h...,want rockky dale havent texted back
112163,@FlashMcDonnell I dont have that kind of capit...,dont kind capital wisdom mt incredibly stylish...
88136,mooooomm.. please let me go to warped!,mooooomm please let go warped
235643,@LilyStarbuck D: ARGH! i want you to stay,argh want stay


In [51]:
df_small.shape

(20000, 7)

##BAG OF WORDS

In [52]:
bow = CountVectorizer(max_features=3000)
X_bow = bow.fit_transform(df_small['clean_text'])

In [53]:
X_bow.shape

(20000, 3000)

In [54]:
list(bow.vocabulary_.items())[:10]

[('plan', np.int64(1952)),
 ('tonight', np.int64(2674)),
 ('think', np.int64(2615)),
 ('front', np.int64(1024)),
 ('tv', np.int64(2727)),
 ('early', np.int64(783)),
 ('night', np.int64(1765)),
 ('feel', np.int64(924)),
 ('like', np.int64(1511)),
 ('death', np.int64(650))]

In [55]:
bow_df = pd.DataFrame(X_bow.toarray(), columns=bow.get_feature_names_out())
bow_df.head()

Unnamed: 0,aa,aaron,ability,able,absolute,absolutely,abt,abuse,ac,accent,...,yt,yuck,yucky,yuk,yum,yummy,yup,zero,zombie,zoo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##TF-IDF

In [56]:
tfidf = TfidfVectorizer(max_features=3000)
X_tfidf = tfidf.fit_transform(df_small['clean_text'])

In [57]:
X_tfidf.shape

(20000, 3000)

In [58]:
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())
tfidf_df.head()

Unnamed: 0,aa,aaron,ability,able,absolute,absolutely,abt,abuse,ac,accent,...,yt,yuck,yucky,yuk,yum,yummy,yup,zero,zombie,zoo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
bow_df.iloc[0, :5], tfidf_df.iloc[0, :5]

(aa          0
 aaron       0
 ability     0
 able        0
 absolute    0
 Name: 0, dtype: int64,
 aa          0.0
 aaron       0.0
 ability     0.0
 able        0.0
 absolute    0.0
 Name: 0, dtype: float64)

##WORD-2-VEC

In [60]:
tokenized_sentences = df_small['clean_text'].apply(word_tokenize)

In [61]:
w2v_model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4
)

In [62]:
len(w2v_model.wv.index_to_key)

7219

In [63]:
w2v_model.wv['good']

array([-0.11279695,  0.53899515,  0.26615092, -0.00988856,  0.15911166,
       -0.9932833 ,  0.3185672 ,  1.2215054 , -0.4925745 , -0.3171243 ,
       -0.10380108, -0.96801084, -0.22859657,  0.3832955 ,  0.16908395,
       -0.31793633,  0.03512402, -0.629536  , -0.15820599, -1.3039128 ,
        0.50207025,  0.31857166,  0.4024182 , -0.29031092, -0.30815235,
        0.19551061, -0.5442534 , -0.40362573, -0.50153106,  0.14664899,
        0.6338699 ,  0.12718979,  0.07831811, -0.6955345 , -0.30962065,
        0.7276721 ,  0.09607083, -0.5851534 , -0.32476422, -1.1517893 ,
       -0.02402025, -0.78994685, -0.2130446 ,  0.07987874,  0.67892456,
       -0.42648858, -0.47940058, -0.18268701,  0.38842183,  0.528544  ,
        0.2153066 , -0.73599553, -0.17615287,  0.01776907, -0.40392205,
        0.33350137,  0.44629574, -0.05553301, -0.5822744 ,  0.11880663,
        0.28560826,  0.18816096,  0.05862152, -0.02898973, -0.5805096 ,
        0.6070543 ,  0.11914711,  0.68459326, -0.66876584,  0.70

In [64]:
w2v_model.wv.most_similar('good')

[('great', 0.9989157915115356),
 ('bad', 0.9988709092140198),
 ('thats', 0.9985624551773071),
 ('hope', 0.9985167980194092),
 ('news', 0.9985138177871704),
 ('feeling', 0.9984697699546814),
 ('better', 0.9984230399131775),
 ('wow', 0.9984006881713867),
 ('working', 0.9982800483703613),
 ('wasnt', 0.9982352256774902)]