## Sentiment Analysis with Twitter

In [1]:
# Imports and notebook-wide configuration.
import numpy as np
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords

# Fetch the NLTK English stopword list that the preprocessing step relies on.
nltk.download('stopwords')

# Show long tweet text in DataFrame displays instead of truncating it.
# The fully-qualified option name is used: abbreviated names such as
# 'max_colwidth' only resolve by pattern match and can become ambiguous.
pd.set_option('display.max_colwidth', 800)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load the CSV

In [2]:
# Load the labelled tweets from train.csv (path is relative to the notebook's
# working directory) and preview the first rows.
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,2,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦
4,5,0,factsguide: society now #motivation


## Cleaning Data

In [3]:
# Cleaning Raw tweets
def clean_text(text):
    """Normalise a raw tweet into space-separated ASCII letter/digit tokens.

    Steps, applied in order:
      1. Drop every whitespace-delimited token that contains '@'
         (user mentions and e-mail addresses).
      2. Drop http:// and https:// URLs.
      3. Delete punctuation, apostrophes included ("can't" -> "cant").
      4. Replace every remaining character that is not an ASCII letter or
         digit (underscores, non-ASCII letters, whitespace) with a space.
      5. Collapse whitespace runs to one space and trim both ends.

    Parameters
    ----------
    text : str
        The raw tweet.

    Returns
    -------
    str
        The cleaned tweet; may be empty if nothing survives the filters.
    """
    # Remove mentions / e-mail addresses: any token containing '@'.
    text = ' '.join(token for token in text.split() if '@' not in token)

    # Remove web addresses (raw string so \S is a regex class, not a string escape).
    text = re.sub(r'http[s]?://\S+', '', text)

    # Delete punctuation outright so contractions and hashtags stay one word.
    text = re.sub(r'[^\w\s]', '', text)

    # Keep ASCII letters and digits only; everything else becomes a space.
    # This also removes all non-ASCII characters, so no separate pass is
    # needed for them, and apostrophes were already deleted above.
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)

    # Collapse repeated whitespace and drop the leading/trailing space left
    # behind when a tweet starts or ends with removed characters.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [4]:
# Store a cleaned copy of every tweet in a new column, next to the raw text.
df["clean_tweet"] = df["tweet"].apply(clean_text)

In [5]:
# Compare the raw tweets with their cleaned versions side by side.
df.head(5)

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run,when a father is dysfunctional and is so selfish he drags his kids into his dysfunction run
1,2,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked,thanks for lyft credit i cant use cause they dont offer wheelchair vans in pdx disapointed getthanked
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦,model i love u take with u all the time in ur
4,5,0,factsguide: society now #motivation,factsguide society now motivation


In [6]:
# Discard the raw text column; only the cleaned text is used from here on.
df = df.drop(columns=['tweet'])

In [7]:
# Rename 'clean_tweet' to 'tweet' so later cells read the cleaned text under
# the original column name. The result is reassigned instead of using
# inplace=True, which avoids mutating a frame an earlier cell displayed.
df = df.rename(columns={'clean_tweet': 'tweet'})

In [8]:
# Confirm that the 'tweet' column now holds the cleaned text.
df.head()

Unnamed: 0,id,label,tweet
0,1,0,when a father is dysfunctional and is so selfish he drags his kids into his dysfunction run
1,2,0,thanks for lyft credit i cant use cause they dont offer wheelchair vans in pdx disapointed getthanked
2,3,0,bihday your majesty
3,4,0,model i love u take with u all the time in ur
4,5,0,factsguide society now motivation


In [9]:
# (rows, columns) of the frame after swapping the raw text for the cleaned text.
df.shape

(31962, 3)

## Preprocessing the data

In [10]:
# Preprocess every tweet into a string of stemmed, stopword-free tokens.
# The resulting list `corpus` has one entry per row of df, in row order.
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

# Build the stopword set once. Rebuilding it inside the comprehension would
# reload the word list for every single word of every tweet.
stop_words = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly so the loop does not depend on the
# frame having a default 0..n-1 index.
for tweet in df['tweet']:
    # Lowercase BEFORE filtering: applying [^a-z] first would erase uppercase
    # letters instead of folding them to lowercase.
    review = re.sub(r'[^a-z]', ' ', tweet.lower())
    # Drop stopwords, then reduce each remaining word to its Porter stem.
    review = [ps.stem(word) for word in review.split() if word not in stop_words]
    corpus.append(' '.join(review))

In [11]:
# Convert the preprocessed tweets into TF-IDF feature vectors.
from sklearn.feature_extraction.text import TfidfVectorizer
# max_features=2000 caps the vocabulary at 2000 terms.
cv=TfidfVectorizer(max_features=2000)
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so the vocabulary and IDF weights are learned from test
# tweets as well. Consider fitting on the training split only.
# .toarray() turns the sparse TF-IDF matrix into a dense array.
x=cv.fit_transform(corpus).toarray()

In [12]:
# Target vector: the 'label' column (0/1 in the rows previewed above).
y=df['label']

## TRAIN-TEST SPLIT

In [13]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state=0 makes the split reproducible.
# NOTE(review): the labels are imbalanced (the report further down shows 408
# class-1 rows among 6393 test rows). Passing stratify=y would keep the class
# ratio equal in both splits; confirm whether that is wanted.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

## MODEL BUILDING USING MULTINOMIAL NAIVE BAYES

In [14]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier (default hyperparameters) on the
# training features, then predict labels for the held-out test rows.
clf=MultinomialNB()
clf.fit(x_train, y_train)
y_pred=clf.predict(x_test)

In [15]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
# Confusion matrix: rows are true labels, columns are predicted labels.
print(confusion_matrix(y_test, y_pred))
# Per-class precision / recall / F1. With imbalanced labels these are more
# informative than the single accuracy figure printed last.
print(classification_report(y_test, y_pred))
print('accuracy_score: ', accuracy_score(y_test, y_pred))

[[5966   19]
 [ 267  141]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      5985
           1       0.88      0.35      0.50       408

    accuracy                           0.96      6393
   macro avg       0.92      0.67      0.74      6393
weighted avg       0.95      0.96      0.95      6393

accuracy_score:  0.9552635695291726


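## Conclusion

The model reaches about 96% overall accuracy on the held-out test set (6,393 tweets). That figure is dominated by the majority class: only 408 test tweets belong to class 1, so always predicting class 0 would already score about 94%. On class 1 the model has a precision of 0.88 but a recall of only 0.35 (141 of 408 tweets found), so most class-1 tweets are still missed.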