In [2]:
!pip install pandas nltk



In [3]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from collections import Counter
from nltk.stem import WordNetLemmatizer

In [4]:
df = pd.read_csv('/content/Twitter Sentiments.csv',low_memory =False)
print(df.head())

   id  label                                              tweet
0   1      0   @user when a father is dysfunctional and is s...
1   2      0  @user @user thanks for #lyft credit i can't us...
2   3      0                                bihday your majesty
3   4      0  #model   i love u take with u all the time in ...
4   5      0             factsguide: society now    #motivation


In [5]:
df = df.drop(columns=['id', 'label'])
df

Unnamed: 0,tweet
0,@user when a father is dysfunctional and is s...
1,@user @user thanks for #lyft credit i can't us...
2,bihday your majesty
3,#model i love u take with u all the time in ...
4,factsguide: society now #motivation
...,...
31957,ate @user isz that youuu?ðððððð...
31958,to see nina turner on the airwaves trying to...
31959,listening to sad songs on a monday morning otw...
31960,"@user #sikh #temple vandalised in in #calgary,..."


In [6]:
df['clean_text']= df['tweet'].str.lower()
print(df.head())

                                               tweet  \
0   @user when a father is dysfunctional and is s...   
1  @user @user thanks for #lyft credit i can't us...   
2                                bihday your majesty   
3  #model   i love u take with u all the time in ...   
4             factsguide: society now    #motivation   

                                          clean_text  
0   @user when a father is dysfunctional and is s...  
1  @user @user thanks for #lyft credit i can't us...  
2                                bihday your majesty  
3  #model   i love u take with u all the time in ...  
4             factsguide: society now    #motivation  


In [7]:
nltk.download('stopwords')
",".join(stopwords.words('english'))

stop = set(stopwords.words('english'))
def remove_stopwords(df):
    return " ".join([word for word in df.split() if word not in stop])

df ['clean_text'] = df['clean_text'].apply(lambda x : remove_stopwords(x))

print (df)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                   tweet  \
0       @user when a father is dysfunctional and is s...   
1      @user @user thanks for #lyft credit i can't us...   
2                                    bihday your majesty   
3      #model   i love u take with u all the time in ...   
4                 factsguide: society now    #motivation   
...                                                  ...   
31957  ate @user isz that youuu?ðððððð...   
31958    to see nina turner on the airwaves trying to...   
31959  listening to sad songs on a monday morning otw...   
31960  @user #sikh #temple vandalised in in #calgary,...   
31961                   thank you @user for you follow     

                                              clean_text  
0      @user father dysfunctional selfish drags kids ...  
1      @user @user thanks #lyft credit can't use caus...  
2                                         bihday majesty  
3      #model love u take u time urð±!!! 

#Word Count

In [8]:
word_count = Counter ()
for text in df['clean_text']:
    for word in text.split():
        word_count[word] +=1

print(word_count.most_common(20))


[('@user', 17291), ('&amp;', 1574), ('day', 1454), ('#love', 1449), ('happy', 1328), ('-', 1244), ('u', 1116), ('love', 1112), ("i'm", 992), ('like', 920), ('time', 918), ('â\x80¦', 905), ('get', 903), ('new', 902), ('.', 871), ('#positive', 870), ('good', 734), ('see', 732), ("can't", 727), ('!', 688)]


#Special Character Removal

In [11]:
nltk.download('punkt')
nltk.download('wordnet')
from textblob import TextBlob


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
def form_sentence(tweet):
  tweet_blob = TextBlob(tweet)
  return ' '.join(tweet_blob.words)

In [14]:
df['Refined_data']= df['clean_text'].apply(form_sentence)
df.head()

Unnamed: 0,tweet,clean_text,Data_hashtag,Refined_data
0,@user when a father is dysfunctional and is s...,@user father dysfunctional selfish drags kids ...,user father dysfunctional selfish drags kids d...,user father dysfunctional selfish drags kids d...
1,@user @user thanks for #lyft credit i can't us...,@user @user thanks #lyft credit can't use caus...,user user thanks lyft credit ca n't use cause ...,user user thanks lyft credit ca n't use cause ...
2,bihday your majesty,bihday majesty,bihday majesty,bihday majesty
3,#model i love u take with u all the time in ...,#model love u take u time urð±!!! ððð...,model love u take u time urð± ðððð...,model love u take u time urð± ðððð...
4,factsguide: society now #motivation,factsguide: society #motivation,factsguide society motivation,factsguide society motivation


#Tokenization

In [29]:
from nltk import word_tokenize
from nltk.tokenize import word_tokenize
#function to find the features in the document
# Function to find the features in the document
def find_features(tweet):
    words = word_tokenize(tweet)
    features = {}
    word_features = df['clean_text'].isin(words)
    for w in word_features:
        features[w] = (w in words)
    return features

In [35]:
df['Tokenize'] = df['clean_text'].apply(find_features)
df.head()

Unnamed: 0,tweet,clean_text,Data_hashtag,Refined_data,Tokenize,stemmed_text,Lemmatized_text
0,@user when a father is dysfunctional and is s...,@user father dysfunctional selfish drags kids ...,user father dysfunctional selfish drags kids d...,user father dysfunctional selfish drags kids d...,{False: False},@user father dysfunct selfish drag kid dysfunc...,[u]
1,@user @user thanks for #lyft credit i can't us...,@user @user thanks #lyft credit can't use caus...,user user thanks lyft credit ca n't use cause ...,user user thanks lyft credit ca n't use cause ...,{False: False},@user @user thank #lyft credit can't use caus ...,[u]
2,bihday your majesty,bihday majesty,bihday majesty,bihday majesty,"{False: False, True: False}",bihday majesti,[b]
3,#model i love u take with u all the time in ...,#model love u take u time urð±!!! ððð...,model love u take u time urð± ðððð...,model love u take u time urð± ðððð...,"{False: False, True: False}",#model love u take u time urð±!!! ððð...,[m]
4,factsguide: society now #motivation,factsguide: society #motivation,factsguide society motivation,factsguide society motivation,"{False: False, True: False}",factsguide: societi #motiv,[f]


#Stemming of Words

In [31]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

df["stemmed_text"] = df["clean_text"].apply(stem_words)
print(df.head())

                                               tweet  \
0   @user when a father is dysfunctional and is s...   
1  @user @user thanks for #lyft credit i can't us...   
2                                bihday your majesty   
3  #model   i love u take with u all the time in ...   
4             factsguide: society now    #motivation   

                                          clean_text  \
0  @user father dysfunctional selfish drags kids ...   
1  @user @user thanks #lyft credit can't use caus...   
2                                     bihday majesty   
3  #model love u take u time urð±!!! ððð...   
4                    factsguide: society #motivation   

                                        Data_hashtag  \
0  user father dysfunctional selfish drags kids d...   
1  user user thanks lyft credit ca n't use cause ...   
2                                     bihday majesty   
3  model love u take u time urð± ðððð...   
4                      factsguide society moti

#Lemmatization

In [32]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [33]:
def normalization(tweet_list):
  lem = WordNetLemmatizer()
  normalized_tweet = []
  for word in tweet_list:
    normalized_text = lem.lemmatize(word,'v')
    normalized_tweet.append(normalized_text)
    return normalized_tweet

In [34]:
df['Lemmatized_text'] = df['Data_hashtag'].apply(normalization)
df.head()

Unnamed: 0,tweet,clean_text,Data_hashtag,Refined_data,Tokenize,stemmed_text,Lemmatized_text
0,@user when a father is dysfunctional and is s...,@user father dysfunctional selfish drags kids ...,user father dysfunctional selfish drags kids d...,user father dysfunctional selfish drags kids d...,"[@, user, father, dysfunctional, selfish, drag...",@user father dysfunct selfish drag kid dysfunc...,[u]
1,@user @user thanks for #lyft credit i can't us...,@user @user thanks #lyft credit can't use caus...,user user thanks lyft credit ca n't use cause ...,user user thanks lyft credit ca n't use cause ...,"[@, user, @, user, thanks, #, lyft, credit, ca...",@user @user thank #lyft credit can't use caus ...,[u]
2,bihday your majesty,bihday majesty,bihday majesty,bihday majesty,"[bihday, majesty]",bihday majesti,[b]
3,#model i love u take with u all the time in ...,#model love u take u time urð±!!! ððð...,model love u take u time urð± ðððð...,model love u take u time urð± ðððð...,"[#, model, love, u, take, u, time, urð±, !, ...",#model love u take u time urð±!!! ððð...,[m]
4,factsguide: society now #motivation,factsguide: society #motivation,factsguide society motivation,factsguide society motivation,"[factsguide, :, society, #, motivation]",factsguide: societi #motiv,[f]
