# Sentiment Analysis *using Twitter dataset*
- 19bce150 - Shivam Panchal
- 19bce245 - Aayush Shah

## Importing libraries

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
import re
import nltk 

## Importing dataset

In [2]:
# !git clone https://github.com/Shah-Aayush/Twitter-Sentiment-Analysis.git

In [3]:
# Read the train and test CSV files into DataFrames.
train = pd.read_csv('../input/twitter-sentiment-analysis-hatred-speech/train.csv')
test = pd.read_csv('../input/twitter-sentiment-analysis-hatred-speech/test.csv')

# Report (rows, columns) of each frame, one per line.
for frame in (train, test):
    print(frame.shape)

## Exploratory Data Analysis

In [4]:
# Preview the first rows of the training frame (head() defaults to five rows).
train.head()

In [5]:
# Preview the first rows of the test frame (head() defaults to five rows).
test.head()

In [6]:
# Report, per column, whether any value is missing in either frame.
# Both results are printed explicitly: written as bare expressions, only the
# last one would be echoed by the notebook and the training-set check would
# be dropped without notice.
print(train.isnull().any())
print(test.isnull().any())

In [7]:
# checking out the first ten label-0 tweets from the train set
# NOTE(review): later cells treat label 0 as the non racist/sexist class, so
# these are the regular tweets, not the negative ones.

train[train['label'] == 0].head(10)

In [8]:
# checking out the first ten label-1 tweets from the train set
# NOTE(review): later cells treat label 1 as the racist/sexist class, so
# these are the negative tweets, not the positive ones.

train[train['label'] == 1].head(10)

In [9]:
# Bar chart of how many training tweets carry each label value.
train['label'].value_counts().plot.bar(color = 'pink', figsize = (6, 4))

In [10]:
# checking the distribution of tweet lengths (character counts) in the data
# NOTE: length_train / length_test hold whatever plot.hist() returns (the plot
# object), not the lengths themselves.
# NOTE(review): both calls run in the same cell, so the two histograms
# presumably end up on the same axes — confirm that is the intended picture.

length_train = train['tweet'].str.len().plot.hist(color = 'pink', figsize = (6, 4))
length_test = test['tweet'].str.len().plot.hist(color = 'orange', figsize = (6, 4))

In [11]:
# Store each tweet's character count in a new 'len' column on both frames.
for frame in (train, test):
    frame['len'] = frame['tweet'].str.len()

train.head(10)

In [12]:
# Descriptive statistics of the training frame, computed separately for each label.
train.groupby('label').describe()

In [13]:
# Mean label for every distinct tweet length, drawn as a histogram of those means.
# The 'label' column is selected *before* aggregating: averaging the whole
# frame would also try to average the text column 'tweet', which raises a
# TypeError on pandas >= 2.0 (non-numeric columns are no longer dropped silently).
train.groupby('len')['label'].mean().plot.hist(color = 'black', figsize = (6, 4),)
plt.title('variation of length')
plt.xlabel('Length')
plt.show()

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Count every token that is not an English stop word across the raw training tweets.
cv = CountVectorizer(stop_words = 'english')
words = cv.fit_transform(train.tweet)

# Collapse the document-term matrix into one row holding the total per term.
sum_words = words.sum(axis=0)

# Pair each vocabulary term with its total, then order from most to least frequent.
words_freq = [(term, sum_words[0, col]) for term, col in cv.vocabulary_.items()]
words_freq.sort(key = lambda pair: pair[1], reverse = True)

frequency = pd.DataFrame(words_freq, columns=['word', 'freq'])

# Bar chart of the 30 most frequent terms.
top_terms = frequency.head(30)
top_terms.plot(x='word', y='freq', kind='bar', figsize=(15, 7), color = 'blue')
plt.title("Most Frequently Occuring Words - Top 30")


## WordCloud representations

In [15]:
from wordcloud import WordCloud

# Word cloud in which each word is sized by its total count in words_freq.
term_totals = dict(words_freq)
wordcloud = WordCloud(background_color = 'white', width = 2000, height = 1000).generate_from_frequencies(term_totals)

plt.figure(figsize=(10,8))
plt.imshow(wordcloud)
plt.title("WordCloud - Vocabulary from Reviews", fontsize = 22)

In [16]:
# Concatenate every label-0 tweet into one space-separated string.
normal_words = ' '.join(train.loc[train['label'] == 0, 'tweet'])

# Build the cloud from that text and show it without axes.
cloud_builder = WordCloud(width=800, height=800, random_state = 0, max_font_size = 110)
wordcloud = cloud_builder.generate(normal_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title('The Neutral Words')
plt.show()


In [17]:
# Concatenate every label-1 tweet into one space-separated string.
negative_words = ' '.join(train.loc[train['label'] == 1, 'tweet'])

# Build the cloud from that text and show it without axes.
cloud_builder = WordCloud(background_color = 'cyan', width=1000, height=500, random_state = 0, max_font_size = 110)
wordcloud = cloud_builder.generate(negative_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title('The Negative Words')
plt.show()


## Collecting the hashtags

In [18]:
def hashtag_extract(x):
    """Return the hashtags found in each text of ``x``.

    A hashtag is a ``#`` followed by one or more word characters; the
    leading ``#`` is not included in the returned strings.

    Args:
        x: An iterable of strings.

    Returns:
        A list with one entry per input text, each entry being the list of
        hashtags found in that text (empty when it has none).
    """
    return [re.findall(r"#(\w+)", text) for text in x]

In [19]:
from itertools import chain

# extracting hashtags from non racist/sexist tweets
HT_regular = hashtag_extract(train['tweet'][train['label'] == 0])

# extracting hashtags from racist/sexist tweets
HT_negative = hashtag_extract(train['tweet'][train['label'] == 1])

# unnesting list
# chain.from_iterable flattens in linear time; sum(list_of_lists, []) copies the
# growing result on every addition, which is quadratic in the number of tweets.
HT_regular = list(chain.from_iterable(HT_regular))
HT_negative = list(chain.from_iterable(HT_negative))

In [20]:
# Count how often each hashtag occurs in the regular (label 0) tweets.
a = nltk.FreqDist(HT_regular)
hashtag_counts = pd.DataFrame({'Hashtag': list(a), 'Count': list(a.values())})

# Keep the 20 hashtags with the largest counts and draw them as a bar chart.
d = hashtag_counts.nlargest(20, "Count")
plt.figure(figsize=(16,5))
ax = sns.barplot(data=d, x= "Hashtag", y = "Count")
ax.set(ylabel = 'Count')
plt.show()

In [21]:
# Count how often each hashtag occurs in the racist/sexist (label 1) tweets.
a = nltk.FreqDist(HT_negative)
hashtag_counts = pd.DataFrame({'Hashtag': list(a), 'Count': list(a.values())})

# Keep the 20 hashtags with the largest counts and draw them as a bar chart.
d = hashtag_counts.nlargest(20, "Count")
plt.figure(figsize=(16,5))
ax = sns.barplot(data=d, x= "Hashtag", y = "Count")
ax.set(ylabel = 'Count')
plt.show()

## Tokenization

In [22]:
# tokenizing the words present in the training set (plain whitespace split, no cleaning)
tokenized_tweet = train['tweet'].apply(lambda x: x.split()) 

# importing gensim
import gensim

# creating a word to vector model
# NOTE(review): the corpus is passed to the constructor here and train() is
# called again below; gensim presumably already trains when the constructor is
# given sentences, so the model would get both passes — confirm this is intended.
model_w2v = gensim.models.Word2Vec(
            tokenized_tweet,
#             size=200, # desired no. of features/independent variables (disabled, so the library's default vector size applies)
            window=5, # context window size
            min_count=2, # presumably ignores words seen fewer than 2 times — see gensim docs
            sg = 1, # 1 for skip-gram model
            hs = 0,
            negative = 10, # for negative sampling
            workers= 2, # no.of cores
            seed = 34)

# 20 more passes over the same tokenised tweets
model_w2v.train(tokenized_tweet, total_examples= len(train['tweet']), epochs=20)

In [23]:
# Words whose learned vectors are closest to "dinner".
model_w2v.wv.most_similar(positive = "dinner")

In [24]:
# Words whose learned vectors are closest to "cancer".
model_w2v.wv.most_similar(positive = "cancer")

In [25]:
# Words whose learned vectors are closest to "apple".
model_w2v.wv.most_similar(positive = "apple")

In [26]:
# "hate" is passed as a negative example here, not a positive one.
# NOTE(review): this presumably ranks words by similarity to the negated
# "hate" vector — confirm against the gensim most_similar documentation.
model_w2v.wv.most_similar(negative = "hate")

In [27]:
from tqdm import tqdm
# Hook tqdm into pandas, with "progress-bar" as the bar description.
tqdm.pandas(desc="progress-bar")
from gensim.models.doc2vec import TaggedDocument

In [28]:
def add_label(twt):
    """Wrap every entry of ``twt`` in a ``TaggedDocument`` tagged by its index.

    Args:
        twt: An indexed sequence (it must expose ``.index``) whose entries
            become the documents' words.

    Returns:
        A list of ``TaggedDocument`` objects, one per entry, each carrying the
        single tag ``"tweet_<index label>"``.
    """
    return [
        TaggedDocument(tokens, ["tweet_" + str(idx)])
        for idx, tokens in zip(twt.index, twt)
    ]

# label all the tweets: one TaggedDocument per tokenised training tweet
labeled_tweets = add_label(tokenized_tweet)

# show the first six tagged documents
labeled_tweets[:6]

In [29]:
# tools used by the cleaning cells below: regex, NLTK stop words and the Porter stemmer

import re
import nltk

# download the NLTK 'stopwords' resource so stopwords.words('english') is available
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


## Stopwords removal and stemming (training set)

In [30]:
# Build the cleaned training corpus: letters only, lower-cased, English stop
# words removed, remaining words stemmed, then re-joined with single spaces.

# The stemmer and the stop-word set are created once. Rebuilding the set for
# every word (and the stemmer for every tweet) only repeats identical work and
# makes this loop far slower than it needs to be.
ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))

train_corpus = []

# Iterate over the column itself and not a hard-coded row count, so the loop
# covers exactly the rows that were loaded, whatever their number or index.
for tweet in train['tweet']:
    # replace every non-letter with a space, lower-case, split into tokens
    review = re.sub('[^a-zA-Z]', ' ', tweet).lower().split()

    # drop stop words, then stem what is left
    review = [ps.stem(word) for word in review if word not in english_stopwords]

    # joining them back with space
    train_corpus.append(' '.join(review))

## Stopwords removal and stemming (test set)

In [31]:
# Build the cleaned test corpus: letters only, lower-cased, English stop
# words removed, remaining words stemmed, then re-joined with single spaces.

# The stemmer and the stop-word set are created once. Rebuilding the set for
# every word (and the stemmer for every tweet) only repeats identical work and
# makes this loop far slower than it needs to be.
ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))

test_corpus = []

# Iterate over the column itself and not a hard-coded row count, so the loop
# covers exactly the rows that were loaded, whatever their number or index.
for tweet in test['tweet']:
    # replace every non-letter with a space, lower-case, split into tokens
    review = re.sub('[^a-zA-Z]', ' ', tweet).lower().split()

    # drop stop words, then stem what is left
    review = [ps.stem(word) for word in review if word not in english_stopwords]

    # joining them back with space
    test_corpus.append(' '.join(review))

## Bag of Words

In [32]:
# creating bag of words for the cleaned training corpus

from sklearn.feature_extraction.text import CountVectorizer

# vocabulary capped at 2500 terms
cv = CountVectorizer(max_features = 2500)
# fit the vocabulary on train_corpus and build the dense count matrix
x = cv.fit_transform(train_corpus).toarray()
# target: the second column of train, selected by position
# NOTE(review): presumably the 'label' column — confirm the CSV column order
y = train.iloc[:, 1]

print(x.shape)
print(y.shape)


In [33]:
# creating bag of words for the cleaned test corpus

from sklearn.feature_extraction.text import CountVectorizer

# Reuse the vectorizer already fitted on train_corpus and only transform here.
# Fitting a fresh CountVectorizer on the test corpus would learn a different
# vocabulary, so the columns of this matrix would not correspond to the
# features the models are trained on.
# NOTE(review): X_test is reassigned by the train/validation split in the next
# cell, so this matrix is not what the models below are evaluated on.
X_test = cv.transform(test_corpus).toarray()

print(X_test.shape)


## Splitting dataset

In [34]:
# splitting the training data into train and valid sets

from sklearn.model_selection import train_test_split

# Hold out 25% of the labelled rows; the fixed random_state keeps the split repeatable.
# NOTE: this rebinds X_test, replacing the test-corpus bag of words assigned just above.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 42)

# Shapes of the four resulting pieces, one per line.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

## Feature Scaling

In [35]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Learn the scaling statistics on the training split only, then apply that
# same transformation to the held-out split exactly once. Transforming X_test
# a second time would standardise already-standardised values, so the held-out
# features would no longer be on the scale the models are trained on.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Training various models on training set : 

  - ### 1. Logistic Regression

In [36]:
from sklearn.linear_model import LogisticRegression

# Fit on the training split; fit() hands back the estimator, so it is bound directly.
clf_lr = LogisticRegression(random_state=0).fit(X_train, y_train)

# Predicted labels for the held-out split.
y_pred_lr = clf_lr.predict(X_test)

  - ### 2. K-Nearest Neighbor (KNN)

In [37]:
from sklearn.neighbors import KNeighborsClassifier

# 5 nearest neighbours under the Minkowski metric with p = 2.
clf_knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2).fit(X_train, y_train)

# Predicted labels for the held-out split.
y_pred_knn = clf_knn.predict(X_test)

  - ### 3. Support Vector Machine (SVM)

In [38]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, fitted on the training split.
clf_svc = SVC(kernel='linear', random_state=0).fit(X_train, y_train)

# Predicted labels for the held-out split.
y_pred_svc = clf_svc.predict(X_test)

  - ### 4. Kernel SVM 

In [39]:
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel, fitted on the training split.
clf_kernelSVC = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)

# Predicted labels for the held-out split.
y_pred_kernelSVC = clf_kernelSVC.predict(X_test)

  - ### 5. Naive Bayes 

In [40]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, fitted on the training split.
clf_nb = GaussianNB().fit(X_train, y_train)

# Predicted labels for the held-out split.
y_pred_nb = clf_nb.predict(X_test)

  - ### 6. Decision Tree 

  - 6.1 with **GINI**

In [41]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree using the Gini criterion, fitted on the training split.
clf_dtGINI = DecisionTreeClassifier(criterion='gini',  random_state=0).fit(X_train, y_train)

# Predicted labels for the held-out split.
y_pred_dtGINI = clf_dtGINI.predict(X_test)

  - 6.2 with **ENTROPY**

In [42]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree using the entropy criterion, fitted on the training split.
clf_dtENTROPY = DecisionTreeClassifier(criterion='entropy',  random_state=0).fit(X_train, y_train)

# Predicted labels for the held-out split.
y_pred_dtENTROPY = clf_dtENTROPY.predict(X_test)

  - ### 7. Random Forest 

  - 7.1 with **GINI**

In [43]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 10 trees using the Gini criterion, fitted on the training split.
clf_rfcGINI = RandomForestClassifier(n_estimators = 10, criterion = 'gini', random_state = 0).fit(X_train, y_train)

# Predicted labels for the held-out split.
y_pred_rfcGINI = clf_rfcGINI.predict(X_test)

  - 7.2 with **ENTROPY**

In [44]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 10 trees using the entropy criterion, fitted on the training split.
clf_rfcENTROPY = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0).fit(X_train, y_train)

# Predicted labels for the held-out split.
y_pred_rfcENTROPY = clf_rfcENTROPY.predict(X_test)

  - ### 8. XG Boost Classification

In [45]:
from xgboost import XGBClassifier

# Gradient-boosted classifier with default settings, fitted on the training split.
clf_xgboost = XGBClassifier().fit(X_train, y_train)

# Predicted labels for the held-out split.
y_pred_xgboost = clf_xgboost.predict(X_test)

## Evaluating the model performance with Accuracy, Confusion Matrix and F1 score

In [None]:
!pip install -U prettytable

In [49]:
from prettytable import PrettyTable
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

In [50]:
# Summary table: one row per fitted model, with a dashed row between models.
evaluataionTable = PrettyTable()
evaluataionTable.field_names = ["Model", "Training Accuracy", "Validation Accuracy", "Confusion Matrix", "F1 score"]

# (display name, fitted classifier, its predictions on the held-out split)
evaluated_models = [
    ("Logistic Regression", clf_lr, y_pred_lr),
    ("K Nearest Neighbor", clf_knn, y_pred_knn),
    ("Support Vector Machine", clf_svc, y_pred_svc),
    ("SVM Kernel", clf_kernelSVC, y_pred_kernelSVC),
    ("Naïve Bayes", clf_nb, y_pred_nb),
    ("Decision Tree (with GINI)", clf_dtGINI, y_pred_dtGINI),
    ("Decision Tree (with Entropy)", clf_dtENTROPY, y_pred_dtENTROPY),
    ("Random Forest (with GINI)", clf_rfcGINI, y_pred_rfcGINI),
    ("Random Forest (with ENTROPY)", clf_rfcENTROPY, y_pred_rfcENTROPY),
    ("XG Boost", clf_xgboost, y_pred_xgboost),
]
separator_row = ["------------------------------", "------------------", "------------------", "------------------", "--------------------"]

# Generating the rows in a loop keeps every metric tied to the model named in
# that row; hand-copied rows are how one model's F1 ends up under another's name.
for position, (model_name, fitted_clf, y_pred) in enumerate(evaluated_models):
    if position:
        evaluataionTable.add_row(list(separator_row))
    evaluataionTable.add_row([
        model_name,
        # Training accuracy is scored on the training split itself. This needs
        # one extra predict() per model, which can be slow for KNN and the SVMs.
        accuracy_score(y_train, fitted_clf.predict(X_train)),
        accuracy_score(y_test, y_pred),
        confusion_matrix(y_test, y_pred),
        f1_score(y_test, y_pred),
    ])
print(evaluataionTable)