Import Python libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from jupyterthemes import jtplot
import seaborn as sns
# Apply the jupyterthemes "monokai" look to matplotlib for the plots drawn in
# this notebook: tick marks enabled, grid lines disabled.
jtplot.style(theme= 'monokai', context= 'notebook', ticks= True, grid= False)

Read the dataset

In [None]:
# Load the tweets dataset; the path is relative, so the CSV file must be
# reachable from the notebook's current working directory.
df = pd.read_csv('sentiment_tweets3.csv')

Get the info and description about the DataFrame

In [None]:
# Print a concise summary of df: column names, non-null counts and dtypes.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); by default
# pandas reports these for the numeric columns only.
df.describe()

Drop the column "Index"

In [None]:
# Remove the "Index" column. Because df is reassigned, this cell is not
# idempotent: running it a second time raises KeyError, since the column is
# already gone.
df = df.drop(columns=['Index'])

Get the updated info about the DataFrame

In [None]:
# Re-inspect the frame to confirm that the "Index" column is no longer present.
df.info()

Plot the number of samples for each value of "label (depression result)"

In [None]:
# DataFrame.hist draws one histogram per numeric column of df.
df.hist(bins=15, figsize=(10,5), color ='b')

Create a new column with name "length"

In [None]:
# Number of characters in each message. The vectorised .str.len() accessor
# yields NaN for a missing message instead of raising TypeError the way
# apply(len) does on a float NaN.
df['length'] = df['message to examine'].str.len()

Get the updated description of the DataFrame

In [None]:
# Summary statistics again; the numeric "length" column is now part of df.
df.describe()

Display a histogram plot of column "length"

In [None]:
# Distribution of the "length" column, drawn as a histogram with 10 bins.
df['length'].plot.hist(bins=10)

Differentiate the data into classes using the labels

In [None]:
# Split the rows by label value. Mind the naming: rows labelled 0 are stored
# in `positive`, rows labelled 1 in `negative`.
positive = df.loc[df['label (depression result)'] == 0]
negative = df.loc[df['label (depression result)'] == 1]

Combine all the messages into a single string

In [None]:
# Collect every message into a Python list ...
sentences = df['message to examine'].tolist()
# ... and concatenate them, separated by single spaces, into one long string.
sentence = " ".join(sentences)

Combine all the messages of each class into a single string

In [None]:
# Messages of the `positive` subset, joined with single spaces into one string.
pos_sentences = positive['message to examine'].tolist()
pos_sentence = " ".join(pos_sentences)

# Same for the `negative` subset.
neg_sentences = negative['message to examine'].tolist()
neg_sentence = " ".join(neg_sentences)

Install WordCloud

In [None]:
!pip install wordcloud

Display the wordcloud for the complete dataset

In [None]:
from wordcloud import WordCloud

# Word cloud built from every message in the dataset. The axes are hidden
# because pixel coordinates carry no meaning for a word-cloud image, and the
# title lets the figure be told apart from the per-class clouds.
plt.figure(figsize=(10,10))
plt.imshow(WordCloud().generate(sentence), interpolation='bilinear')
plt.axis('off')
plt.title('Word cloud: all messages')
plt.show()

Display the wordcloud for the class "positive"

In [None]:
from wordcloud import WordCloud

# Word cloud built from the messages of the `positive` subset. The axes are
# hidden because pixel coordinates carry no meaning for a word-cloud image,
# and the title identifies which subset is shown.
plt.figure(figsize=(10,10))
plt.imshow(WordCloud().generate(pos_sentence), interpolation='bilinear')
plt.axis('off')
plt.title('Word cloud: class "positive"')
plt.show()

Display the wordcloud for the class "negative"

In [None]:
from wordcloud import WordCloud

# Word cloud built from the messages of the `negative` subset. The axes are
# hidden because pixel coordinates carry no meaning for a word-cloud image,
# and the title identifies which subset is shown.
plt.figure(figsize=(10,10))
plt.imshow(WordCloud().generate(neg_sentence), interpolation='bilinear')
plt.axis('off')
plt.title('Word cloud: class "negative"')
plt.show()

Import libraries for Data Cleaning

In [None]:
import string

import nltk
# Fetch NLTK's "stopwords" corpus so that nltk.corpus.stopwords can be read.
nltk.download('stopwords')

from nltk.corpus import stopwords

Clean the messages

In [None]:
def message_cleaning(message, stop_words=None):
    """Tokenise a message after stripping punctuation and stop words.

    Every character contained in ``string.punctuation`` is deleted, the
    remaining text is split on whitespace, and each word whose lower-cased
    form is a stop word is discarded. Surviving words keep their original
    casing.

    Parameters
    ----------
    message : str
        Raw text of a single message.
    stop_words : collection of str, optional
        Lower-case words to discard. When omitted, NLTK's English stop-word
        list is used; it is loaded on the first call and reused afterwards.

    Returns
    -------
    list of str
        The remaining words, in their original order.
    """
    if stop_words is None:
        # Looking the list up once per word is very slow: build it a single
        # time, keep it as a set for O(1) membership tests, and cache it on
        # the function object for subsequent calls.
        stop_words = getattr(message_cleaning, "_english_stop_words", None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            message_cleaning._english_stop_words = stop_words

    punctuation = set(string.punctuation)
    text = "".join(char for char in message if char not in punctuation)
    return [word for word in text.split() if word.lower() not in stop_words]

Create a Count Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Passing a callable as `analyzer` makes CountVectorizer use message_cleaning
# to turn each raw message into its list of tokens.
vectorizer = CountVectorizer(analyzer=message_cleaning)
# Learn the vocabulary and build the document-term count matrix (scipy sparse).
message_vectorizer = vectorizer.fit_transform(df['message to examine'])

Build the feature matrix X and the target vector y for training

In [None]:
# toarray() expands the sparse count matrix into a dense array with one column
# per vocabulary term, which can use a lot of memory for a large vocabulary.
# NOTE(review): scikit-learn's MultinomialNB accepts sparse input — consider
# skipping the densification if memory becomes a problem.
X = pd.DataFrame(message_vectorizer.toarray())

In [None]:
# Target vector: the label column of df.
y = df['label (depression result)']

Split the data

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for evaluation. The fixed random_state makes the
# shuffle, and therefore every metric reported below, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Create and train the Naive Bayes model

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes classifier with default hyper-parameters,
# fitted on the training split.
NB_classifier = MultinomialNB()
NB_classifier.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

Predict the labels for the test set and plot the confusion matrix

In [None]:
# Predict labels for the held-out test set.
y_predict = NB_classifier.predict(X_test)
# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_test, y_predict)
# fmt='d' prints the integer counts verbatim; the default '.2g' would show a
# count such as 1600 as 1.6e+03.
sns.heatmap(cm, annot=True, fmt='d')

Print the classification report (precision, recall, F1-score, and accuracy)

In [None]:
# Per-class precision, recall, F1-score and support, plus overall accuracy,
# computed on the test split.
print(classification_report(y_test, y_predict))