In [2]:
# Import all required libraries
from sklearn.cluster import KMeans
import nltk
import numpy as np
import re
import pandas as pd 
import pylab as pl
import matplotlib.pyplot as plt

from nltk.tokenize import WordPunctTokenizer
from bs4 import BeautifulSoup
#from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD

from sklearn import metrics
from mpl_toolkits.mplot3d import Axes3D

from matplotlib import pyplot

#plt.style.use('fivethirtyeight')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()

!pip install datashader
import datashader as ds
import datashader.transfer_functions as tf



In [3]:
# Load the raw tweets; the pure-Python parser engine is requested explicitly.
df = pd.read_csv(
    "train.csv",
    engine="python",
)

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,ItemID,SentimentText
0,1,is so sad for my APL frie...
1,2,I missed the New Moon trail...
2,3,omg its already 7:30 :O
3,4,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,i think mi bf is cheating on me!!! ...


In [5]:
# Give the tweet text column the name used by the rest of the notebook.
df = df.rename(columns={'SentimentText': 'full_text'})

In [6]:
# Summarise the frame: index range, per-column dtype and non-null count, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99989 entries, 0 to 99988
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ItemID     99989 non-null  int64 
 1   full_text  99989 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.5+ MB


In [7]:
# Drop the row identifier; only the tweet text is used from here on.
# Re-binding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: running it a second time is a no-op rather
# than a KeyError on the already-removed column.
df = df.drop(columns=['ItemID'], errors='ignore')

In [8]:
# Confirm that only the text column is left.
df.head(n=5)

Unnamed: 0,full_text
0,is so sad for my APL frie...
1,I missed the New Moon trail...
2,omg its already 7:30 :O
3,.. Omgaga. Im sooo im gunna CRy. I'...
4,i think mi bf is cheating on me!!! ...


In [9]:
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression whose matches are deleted.

    Returns
    -------
    str
        ``input_txt`` with all non-overlapping matches of ``pattern``
        replaced by the empty string.
    """
    # Substitute the pattern itself. The previous implementation collected the
    # matched substrings and fed each one back into re.sub as a *new* regex,
    # which (a) misinterpreted metacharacters contained in a match and
    # (b) let a short match such as "@foo" also strip the prefix of "@foobar",
    # leaving "bar" behind.
    return re.sub(pattern, '', input_txt)

In [10]:
# Strip Twitter handles (@user) from every tweet.
# The pattern is a raw string so that `\w` is not treated as an (invalid)
# string escape, and the vectorised .str accessor replaces the Python-level
# loop that np.vectorize performs under the hood.
df['Clean_text'] = df['full_text'].str.replace(r"@[\w]*", "", regex=True)

In [11]:
# Replace every character that is not an ASCII letter or '#' (digits,
# punctuation, special characters) with a space.
# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave the text unchanged.
df['Clean_text'] = df['Clean_text'].str.replace(r"[^a-zA-Z#]", " ", regex=True)

In [12]:
# Compare raw and cleaned text side by side for the first 50 tweets.
df.head(n=50)

Unnamed: 0,full_text,Clean_text
0,is so sad for my APL frie...,is so sad for my APL frie...
1,I missed the New Moon trail...,I missed the New Moon trail...
2,omg its already 7:30 :O,omg its already O
3,.. Omgaga. Im sooo im gunna CRy. I'...,Omgaga Im sooo im gunna CRy I ...
4,i think mi bf is cheating on me!!! ...,i think mi bf is cheating on me ...
5,or i just worry too much?,or i just worry too much
6,Juuuuuuuuuuuuuuuuussssst Chillin!!,Juuuuuuuuuuuuuuuuussssst Chillin
7,Sunny Again Work Tomorrow :-| ...,Sunny Again Work Tomorrow ...
8,handed in my uniform today . i miss you ...,handed in my uniform today i miss you ...
9,hmmmm.... i wonder how she my number @-),hmmmm i wonder how she my number


In [13]:
# Split each cleaned tweet on whitespace into a list of tokens.
tokenized_tweet = df['Clean_text'].map(str.split)

In [14]:
# Import only the name that is used; a wildcard import hides where names come
# from and can silently shadow existing notebook variables.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

# Reduce every token of every tweet to its Porter stem.
tokenized_tweet = tokenized_tweet.apply(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)
tokenized_tweet.head()

0                  [is, so, sad, for, my, apl, friend]
1                   [I, miss, the, new, moon, trailer]
2                                [omg, it, alreadi, O]
3    [omgaga, Im, sooo, im, gunna, cri, I, ve, been...
4          [i, think, mi, bf, is, cheat, on, me, T, T]
Name: Clean_text, dtype: object

In [15]:
# Re-assemble each token list into a single space-separated string.
# A vectorised map replaces the per-element `tokenized_tweet[i] = ...` loop,
# which used label-based assignment (only correct for a default 0..n-1 index)
# and was slow on ~100k rows.
tokenized_tweet = tokenized_tweet.map(' '.join)

df['Clean_text'] = tokenized_tweet

In [16]:
df.loc[:,('full_text','Clean_text')]

Unnamed: 0,full_text,Clean_text
0,is so sad for my APL frie...,is so sad for my apl friend
1,I missed the New Moon trail...,I miss the new moon trailer
2,omg its already 7:30 :O,omg it alreadi O
3,.. Omgaga. Im sooo im gunna CRy. I'...,omgaga Im sooo im gunna cri I ve been at thi d...
4,i think mi bf is cheating on me!!! ...,i think mi bf is cheat on me T T
...,...,...
99984,@Cupcake seems like a repeating problem hop...,seem like a repeat problem hope you re abl to ...
99985,@cupcake__ arrrr we both replied to each other...,arrrr we both repli to each other over differ ...
99986,@CuPcAkE_2120 ya i thought so,ya i thought so
99987,@Cupcake_Dollie Yes. Yes. I'm glad you had mor...,ye ye I m glad you had more fun with me


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# NOTE: despite its name, `tf_idf_vect` is a plain bag-of-words CountVectorizer
# (raw term counts, not TF-IDF weights); the name is kept so existing references
# keep working. Settings:
#   * unigrams only (ngram_range=(1, 1))
#   * scikit-learn's built-in English stop-word list is removed
#   * min_df=0.0001 is a proportion: terms found in fewer than 0.01% of the
#     documents are discarded
tf_idf_vect = CountVectorizer(analyzer='word', ngram_range=(1, 1), stop_words='english', min_df=0.0001)

# Learn the vocabulary and build the sparse document-term matrix in one pass.
desc_matrix = tf_idf_vect.fit_transform(df["Clean_text"])

In [18]:
# Partition the tweets into two clusters with k-means on the document-term matrix.
num_clusters = 2

# random_state pins the centroid initialisation so the cluster assignment (and
# which cluster gets label 0 or 1) is reproducible across runs. n_init is set
# explicitly because its default changed between scikit-learn releases.
km = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
km.fit(desc_matrix)

# One cluster label per tweet, in the same order as the rows of `df`.
clusters = km.labels_.tolist()

In [19]:
# Pair every cleaned tweet with the cluster label k-means assigned to it.
# The cluster labels are also used as the row index of the resulting frame.
tweets = dict(Tweet=df["Clean_text"].tolist(), Cluster=clusters)
frame = pd.DataFrame(data=tweets, index=[clusters])
frame.tail(n=20)

Unnamed: 0,Tweet,Cluster
1,sorri,1
1,damn it dont have sky,1
1,that s the thing the new raft of star war film...,1
1,,1
1,#followfriday,1
1,#awaresg you are not wrong but from a my own m...,1
1,cuz you big burli man hahahahahahahahaha,1
1,tri to get a wider rang of shirt to suit every...,1
1,haha I love the passion in your support,1
1,that suck I like live in coopersvil I don t ne...,1


In [20]:
# Number of tweets per cluster, largest cluster first.
frame["Cluster"].value_counts(sort=True, ascending=False)

1    95846
0     4143
Name: Cluster, dtype: int64

In [23]:
# First five tweets assigned to cluster 1.
frame.loc[frame["Cluster"].eq(1)].head(n=5)

Unnamed: 0,Tweet,Cluster
1,is so sad for my apl friend,1
1,I miss the new moon trailer,1
1,omg it alreadi O,1
1,omgaga Im sooo im gunna cri I ve been at thi d...,1
1,i think mi bf is cheat on me T T,1


In [24]:
# First five tweets assigned to cluster 0.
frame.loc[frame["Cluster"].eq(0)].head(n=5)

Unnamed: 0,Tweet,Cluster
0,ju got hom Fr tda funer I m So sad I cri So mu...,0
0,waahhh now I m get sad miss hub quot quot quot...,0
0,quot vandal paint swastika on home of author o...,0
0,quot An unknown error occur quot uh oh iphon O...,0
0,quot I know you hungri so let go outsid on the...,0


# ---------------------------------------------------------------------------------------------------------------