# Sentiment Analysis of Tweets Using NLTK
# Analytics Vidhya Hackathon

In [4]:
#Import libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import seaborn as sns
import re
import string
sns.set_style('white')
%matplotlib inline

In [6]:
#Load dataframe
tweets = pd.read_csv('train.csv')

# Exporatory Data Analysis 

In [7]:
tweets.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [8]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7920 entries, 0 to 7919
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7920 non-null   int64 
 1   label   7920 non-null   int64 
 2   tweet   7920 non-null   object
dtypes: int64(2), object(1)
memory usage: 185.8+ KB


# Data cleaning 

In [12]:
#defining function to remove the urls from the tweets
def remove_url(tweet):
    tweet = re.sub(r'http[\:\/\/./A-Za-z0-9]+', '', tweet)
    return tweet

tweets['tweet'] = tweets['tweet'].apply(remove_url)
tweets.head(10)

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test #android #apps #...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...
5,6,1,iPhone software update fucked up my phone big ...
6,7,0,Happy for us .. #instapic #instadaily #us #son...
7,8,0,New Type C charger cable #UK -/112598674021 … ...
8,9,0,Bout to go shopping again listening to music #...
9,10,0,Photo: #fun #selfie #pool #water #sony #camera...


In [30]:
#remove unwanted speacial characters
#stemming
#lowering the alphabets
#spliting the words
#removing the stopwrods and joining it
#load test in the corpus

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()
corpus=[]
for i in range(0, len(tweets)):
    text = re.sub('[^a-zA-Z]', ' ', tweets['tweet'][i])
    text= text.lower()
    text=text.split()
    
    text=[ps.stem(word) for word in text if not word in stopwords.words('english')]
    text=' '.join(text)
    corpus.append(text)

In [31]:
corpus

['fingerprint pregnanc test android app beauti cute health iger iphoneonli iphonesia iphon',
 'final transpar silicon case thank uncl yay soni xperia sonyexperia',
 'love would go talk makememori unplug relax iphon smartphon wifi connect',
 'wire know georg made way iphon cute daventri home uj k',
 'amaz servic appl even talk question unless pay stupid support',
 'iphon softwar updat fuck phone big time stupid iphon',
 'happi us instap instadaili us soni xperia xperiaz',
 'new type c charger cabl uk bay amazon etsi new year rob cross tobi young evemun mcmafia taylor spectr newyear start recip technolog samsunggalaxi iphonex pic twitter com pjiwq wtc',
 'bout go shop listen music iphon justm music likeforlik followforfollow',
 'photo fun selfi pool water soni camera picoftheday sun instagood boy cute outdoor gp',
 'hey appl make new ipod dont make new color inch thinner make crash everi five fuckin minit',
 'ha heavi machineri need appl realli drop ball design drinkyourhaterad',
 'conte

In [36]:
#we can apply this method also to clean the data
#clean the dataframe tweets
#create the func to clean the text

#def clean_tweets(tweet):
#    tweet = re.sub(r'@[A-Za-z0-9]+', '', tweet) #remove @mentions
#    tweet = re.sub(r'#', '', tweet)   #remove '#' tag
#    tweet = re.sub(r'[RT]+', '', tweet) #remove RT
#    tweet = re.sub(r'[.,:$^&*);^@!?]+', '', tweet)
     
#    return tweet


In [34]:
#tweets['tweet'] = tweets['tweet'].apply(clean_tweets)
#tweets.head(-10)

In [37]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [38]:
x=corpus

In [39]:
y=tweets['label']

In [40]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.33)

In [41]:
from sklearn.pipeline import Pipeline
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.linear_model import PassiveAggressiveClassifier

In [118]:
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classifier',MultinomialNB(alpha=0.1))
])

In [119]:
pipeline.fit(x,y)

Pipeline(steps=[('bow', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('classifier', MultinomialNB(alpha=0.1))])

In [120]:
pred = pipeline.predict(x_test)

In [121]:
from sklearn.metrics import classification_report, confusion_matrix

In [122]:
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1853
           1       0.97      0.87      0.92       761

    accuracy                           0.96      2614
   macro avg       0.96      0.93      0.94      2614
weighted avg       0.96      0.96      0.95      2614



In [97]:
#Lets apply model on test data

test = pd.read_csv('test.csv')
ID = test['id']
test.head()

Unnamed: 0,id,tweet
0,7921,I hate the new #iphone upgrade. Won't let me d...
1,7922,currently shitting my fucking pants. #apple #i...
2,7923,"I'd like to puts some CD-ROMS on my iPad, is t..."
3,7924,My ipod is officially dead. I lost all my pict...
4,7925,Been fighting iTunes all night! I only want th...


In [61]:
test.drop(['id'], axis=1, inplace=True)

In [62]:
test_clean = test['tweet'].apply(remove_url)

In [1106]:
#test_cleaned = test['tweet'].apply(clean_tweets)

In [1107]:
#test_stop = test['tweet'].apply(remove_stopwords)

In [1108]:
#test_stop.head()

In [63]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()
corpus1=[]
for i in range(0, len(test)):
    text = re.sub('[^a-zA-Z]', ' ', test['tweet'][i])
    text= text.lower()
    text=text.split()
    
    text=[ps.stem(word) for word in text if not word in stopwords.words('english')]
    text=' '.join(text)
    corpus1.append(text)

In [123]:
x_test1=corpus1

In [124]:
predict = pipeline.predict(x_test1)

In [125]:
sub = pd.DataFrame({'id':ID, 'label':predict})
sub.to_csv('sentiment_predictions_final.csv', index = False)