# sentiment-analysis

Use the "Run" button to execute the code.

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset path read below resolves (Colab runtime).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import re
import os
import nltk
import numpy as np
import pandas as pd
from textblob import TextBlob
from string import punctuation
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score

# Fetch the NLTK resources used by this notebook: 'stopwords' backs
# stopwords.words('english'); 'punkt' is tokenizer data (word_tokenize is
# imported above but not called in the visible cells).
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load the labelled tweets from the mounted Drive folder into a DataFrame.
train = pd.read_csv('/content/drive/MyDrive/Sentiment Analysis/train.csv')

In [None]:
# Preview the first rows to check the columns that were read.
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [None]:
# Drop the `id` column; only `label` and `tweet` are used in the cells below.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# re-runnable: executing it a second time no longer raises KeyError because
# the column is already gone.
train = train.drop(columns='id', errors='ignore')

# Show how many tweets fall into each label.
train['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,29720
1,2242


# Counting the stop words in each tweet and removing them

In [None]:
# English stop-word list from NLTK; read by the stop-word helper functions defined below.
stop = stopwords.words('english')

In [None]:
def stopword(data, stop_words=None):
  """Add a ``stopwords`` column with each tweet's stop-word count and print a preview.

  Args:
    data: DataFrame with a string ``tweet`` column. It is modified in place:
      a ``stopwords`` column is added (or overwritten).
    stop_words: Optional iterable of words to count. When omitted, the
      notebook-level ``stop`` list is used, so existing calls are unaffected.
  """
  # A set gives constant-time membership tests instead of scanning the list
  # once per token.
  stop_set = set(stop if stop_words is None else stop_words)
  data['stopwords'] = data['tweet'].apply(
      lambda text: sum(1 for token in text.split() if token in stop_set))
  print(data[['tweet', 'stopwords']].head())

In [None]:
# Adds the `stopwords` count column to `train` and prints the first rows.
stopword(train)

                                               tweet  stopwords
0   @user when a father is dysfunctional and is s...         10
1  @user @user thanks for #lyft credit i can't us...          5
2                                bihday your majesty          1
3  #model   i love u take with u all the time in ...          5
4             factsguide: society now    #motivation          1


In [None]:
def stopword_removal(data, stop_words=None):
  """Strip stop words from every tweet in place and print a preview.

  Tweets are split on whitespace, tokens found in the stop-word collection are
  discarded, and the remaining tokens are re-joined with single spaces.

  Args:
    data: DataFrame with a string ``tweet`` column; the column is overwritten.
    stop_words: Optional iterable of words to remove. When omitted, the
      notebook-level ``stop`` list is used, so existing calls are unaffected.
  """
  # A set gives constant-time membership tests instead of scanning the list
  # once per token.
  stop_set = set(stop if stop_words is None else stop_words)
  data['tweet'] = data['tweet'].apply(
      lambda text: ' '.join(token for token in text.split() if token not in stop_set))
  print(data['tweet'].head())

In [None]:
# Rewrites train['tweet'] without stop words and prints the first rows.
stopword_removal(train)

0    @user father dysfunctional selfish drags kids ...
1    @user @user thanks #lyft credit can't use caus...
2                                       bihday majesty
3    #model love u take u time urð±!!! ððð...
4                      factsguide: society #motivation
Name: tweet, dtype: object


# Removing Punctuation

In [None]:
def punctuation(data):
  """Delete every character that is neither a word character nor whitespace.

  Args:
    data: DataFrame with a string ``tweet`` column; the column is overwritten
      in place and its first rows are printed.

  Note:
    This function's name rebinds the ``punctuation`` name imported from
    ``string`` at the top of the notebook. It is kept because the call in the
    next cell depends on it.
  """
  # regex=True is required: without it Series.str.replace treats the pattern
  # as a literal string, so nothing matched and the tweets came back unchanged.
  data['tweet'] = data['tweet'].str.replace(r'[^\w\s]', '', regex=True)
  print(data['tweet'].head())

In [None]:
# Applies the punctuation-stripping step to train['tweet'] and prints the first rows.
punctuation(train)

0    @user father dysfunctional selfish drags kids ...
1    @user @user thanks #lyft credit can't use caus...
2                                       bihday majesty
3    #model love u take u time urð±!!! ððð...
4                      factsguide: society #motivation
Name: tweet, dtype: object


# Removing Most Frequent Words in the Tweets

In [None]:
# Count every whitespace-separated token across all tweets, then keep the
# ten tokens with the highest counts.
token_counts = pd.Series(' '.join(train['tweet']).split()).value_counts()
most_frequency = token_counts.head(10).index.tolist()
most_frequency

['@user', '&amp;', 'day', '#love', 'happy', '-', 'u', 'love', 'like', 'time']

In [None]:
def remove_most_frequent(data, frequent_words=None):
  """Remove the most frequent tokens from every tweet in place and print a preview.

  Args:
    data: DataFrame with a string ``tweet`` column; the column is overwritten.
    frequent_words: Optional iterable of tokens to remove. When omitted, the
      notebook-level ``most_frequency`` list is used, so existing calls are
      unaffected.
  """
  # A set gives constant-time membership tests instead of scanning the list
  # once per token.
  blocked = set(most_frequency if frequent_words is None else frequent_words)
  data['tweet'] = data['tweet'].apply(
      lambda text: ' '.join(token for token in text.split() if token not in blocked))
  print(data['tweet'].head())

In [None]:
# Rewrites train['tweet'] without the tokens listed in `most_frequency` and prints the first rows.
remove_most_frequent(train)

0    father dysfunctional selfish drags kids dysfun...
1    thanks #lyft credit can't use cause offer whee...
2                                       bihday majesty
3    #model take urð±!!! ðððð ð¦ð¦...
4                      factsguide: society #motivation
Name: tweet, dtype: object


# Removing Rare Words in the Tweets

In [None]:
# Count every whitespace-separated token across all tweets, then keep the
# last ten entries of the count table (the bottom of the frequency ranking).
frequency_table = pd.Series(' '.join(train['tweet']).split()).value_counts()
rare_words = frequency_table.tail(10).index.tolist()
rare_words

['dipshit.',
 'tony..',
 'weasel',
 '#liberalisme',
 '#mailboxpride',
 'offended!',
 'omfg',
 'hateful?',
 'deserve)',
 '(even']

In [None]:
def remove_rare_words(data, words=None):
  """Remove the selected rare tokens from every tweet in place and print a preview.

  Args:
    data: DataFrame with a string ``tweet`` column; the column is overwritten.
    words: Optional iterable of tokens to remove. When omitted, the
      notebook-level ``rare_words`` list is used, so existing calls are
      unaffected.
  """
  # A set gives constant-time membership tests instead of scanning the list
  # once per token.
  blocked = set(rare_words if words is None else words)
  data['tweet'] = data['tweet'].apply(
      lambda text: ' '.join(token for token in text.split() if token not in blocked))
  print(data['tweet'].head())

In [None]:
# Rewrites train['tweet'] without the tokens listed in `rare_words` and prints the first rows.
remove_rare_words(train)

0    father dysfunctional selfish drags kids dysfun...
1    thanks #lyft credit can't use cause offer whee...
2                                       bihday majesty
3    #model take urð±!!! ðððð ð¦ð¦...
4                      factsguide: society #motivation
Name: tweet, dtype: object


# Stemming

In [None]:
# Porter stemmer instance reused for every token when the corpus is built below.
stemmer = PorterStemmer()

In [None]:
# Build the cleaned corpus: for each tweet, replace every non-letter with a
# space, lower-case, split on whitespace, Porter-stem each token, and re-join.
non_letters = re.compile("[^a-zA-Z]")

# Iterating over the column's values (instead of train["tweet"][i]) does not
# rely on the index labels being exactly 0..n-1, so this still works if rows
# are ever dropped or filtered before this cell runs.
corpus = [
    " ".join(
        stemmer.stem(word)
        for word in non_letters.sub(" ", str(tweet)).lower().split()
    )
    for tweet in train["tweet"]
]

In [None]:
# Inspect the first five cleaned, stemmed documents.
corpus[:5]

['father dysfunct selfish drag kid dysfunct run',
 'thank lyft credit can t use caus offer wheelchair van pdx disapoint getthank',
 'bihday majesti',
 'model take ur',
 'factsguid societi motiv']

In [None]:
# Sanity check: the number of cleaned documents should equal the number of labels.
len(corpus), len(train['label'])

(31962, 31962)

# TFIDF Vectorizer

In [None]:
# Turn the cleaned tweets into TF-IDF features.
# The result is kept as a SciPy sparse matrix. With tens of thousands of
# tweets and a vocabulary of similar size, a dense float64 copy (.toarray())
# needs several gigabytes, while MultinomialNB, train_test_split and
# cross_val_score all accept sparse input, as does row slicing such as X[:1000].
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so vocabulary and IDF statistics from the test rows leak
# into training. Consider fitting on the training split only.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus)
y = train['label']

# Split our data in order to train the model

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=99
)

# Shapes of the four pieces, in the same order as they were unpacked.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((22373, 31397), (9589, 31397), (22373,), (9589,))

# Naive Bayes Model

In [None]:
# Multinomial Naive Bayes classifier with default settings; cross-validated and fitted below.
naive = MultinomialNB()


In [None]:
from sklearn.model_selection import KFold, cross_val_score


# 3-fold cross-validation (shuffled, fixed seed), evaluated on the first
# 1000 rows of X / y only.
kf3 = KFold(n_splits=3, shuffle=True, random_state=99)
scores_k3 = cross_val_score(estimator=naive, X=X[:1000], y=y[:1000], cv=kf3)
print("3-Fold CV Scores:", scores_k3)
print("Average 3-Fold Score:", scores_k3.mean())



3-Fold CV Scores: [0.91916168 0.93693694 0.91891892]
Average 3-Fold Score: 0.9250058441675209


In [None]:
# 5-fold cross-validation (shuffled, fixed seed), evaluated on the first
# 1000 rows of X / y only.
kf5 = KFold(n_splits=5, shuffle=True, random_state=99)
scores_k5 = cross_val_score(estimator=naive, X=X[:1000], y=y[:1000], cv=kf5)
print("\n5-Fold CV Scores:", scores_k5)
print("Average 5-Fold Score:", scores_k5.mean())


5-Fold CV Scores: [0.92  0.92  0.94  0.925 0.92 ]
Average 5-Fold Score: 0.925


In [None]:
# Fit the Naive Bayes classifier on the training split.
naive.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split.
y_preds = naive.predict(X_test)

In [None]:
# Display the predicted labels.
y_preds

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# scikit-learn's signature is confusion_matrix(y_true, y_pred). Passing the
# predictions first transposed the matrix, so false negatives were displayed
# in the position where false positives are expected.
# With the documented order, rows are true labels and columns are predictions.
confusion_matrix(y_test, y_preds)

array([[8944,  566],
       [   0,   79]])

In [None]:
# Arguments follow the documented (y_true, y_pred) order. Accuracy is symmetric,
# so the value is unchanged, but the call now matches the metric API and stays
# correct if it is swapped for an asymmetric metric such as precision or recall.
# NOTE(review): label 1 is under 10% of the data, so accuracy alone says little
# about how well that class is detected.
acc_naive = accuracy_score(y_test, y_preds)
acc_naive

0.9409740327458547

# Custom testing

In [None]:
# Clean the input the same way the training tweets were cleaned before
# vectorizing. Raw text would not line up with the filtered, stemmed vocabulary
# the TF-IDF vectorizer was fitted on (for example "was" is stemmed to "wa").
sample_text = 'it was good day'

# Drop the same tokens that were filtered out of the training tweets.
filtered_out = set(stop) | set(most_frequency) | set(rare_words)
kept_text = ' '.join(
    token for token in sample_text.split() if token not in filtered_out
)

# Same normalisation as the corpus-building loop: letters only, lower-case, stem.
cleaned_text = ' '.join(
    stemmer.stem(word)
    for word in re.sub("[^a-zA-Z]", " ", kept_text).lower().split()
)

# Transform the cleaned string using the fitted TF-IDF vectorizer
input_transformed = tfidf.transform([cleaned_text])

# Predict the sentiment using the trained Naive Bayes model
preds = naive.predict(input_transformed)

print(preds)

[0]


# TextBlob

In [None]:
# TextBlob polarity score for each of the first ten cleaned tweets.
polarities = [TextBlob(text).polarity for text in corpus[:10]]

# Tally tweets by the sign of their polarity score.
positive = sum(score > 0 for score in polarities)
negative = sum(score < 0 for score in polarities)
# Everything that is neither strictly positive nor strictly negative.
neutral = len(polarities) - positive - negative
# Sum of the individual polarity scores.
polarity = sum(polarities)

print('The Amount of Positive Tweets: ', positive)
print('The Amount Of Negative Tweets: ',negative)
print('The Amount Of Neutral Tweets: ',neutral)
print('Polarity: ', polarity)

The Amount of Positive Tweets:  1
The Amount Of Negative Tweets:  2
The Amount Of Neutral Tweets:  7
Polarity:  -0.5
