# **Q1. Problem Statement: Sentiment Analysis**<br>
Write a Python program that reads the file mood_data.txt.<br> The following tasks have to be taken into
consideration while constructing the solution.<br>
The dataset contains two columns: one is our target <br>(“emotion”, which has 6
different categories) and the other is the independent variable (“Text”, which contains
data in the form of sentences).
1. Load the mood_data.txt data into a DataFrame
2. Generate tokens, remove punctuation and stop words, and convert all rows to lower case
3. Join all the tokens as they were before and store them in a new column named
“cleaned_text”
4. Now remove all single characters, extra space, and special characters and<br>
store processed data in a new column named “processed_text”
5. Create a final DataFrame containing dependent variable(emotion) and
processed text
6. Extract independent variables (Xs) and dependent variables (Ys) into separate<br>
data objects
7. Generate tokens and do vectorization

8. Build a model with Multinomial Naive Bayes, Random Forest, Random Forest <br>
(Entropy), SVM and compare their accuracy


In [None]:
import nltk

# Download the NLTK resources used later in this notebook:
#   'punkt'     - tokenizer models loaded by word_tokenize in older NLTK releases
#   'punkt_tab' - tokenizer tables loaded by word_tokenize in recent NLTK releases;
#                 without it word_tokenize raises LookupError there
#   'stopwords' - stop-word lists, including the English one used by clean_text
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

**Step-1:** Importing Libraries.

In [None]:
# Load the required libraries from Python
# Make sure all the libraries have been downloaded; otherwise download them using the nltk.download command
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk 

**Step-2:** Loading sample data set into dataframe.

In [None]:
# Read the ';'-separated file (upload it to the Google Colab file section first).
# Column names are supplied explicitly, so the first line is read as data.
df_train = pd.read_csv(
    'mood_data.txt',
    sep=';',
    names=['Text', 'Emotion'],
)

In [None]:
# Number of (rows, columns) in the loaded DataFrame
df_train.shape

In [None]:
# Preview the first five rows of the raw data
df_train.head()

**Step-3:** Generating tokens, removing punctuation and stop words, and converting all rows to lower case.

In [None]:
# Load the required libraries for cleaning
import string,re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
# Create a function to generate cleaned data from raw text
def clean_text(mood):
    """Return *mood* with punctuation and English stop words removed.

    The text is tokenized with NLTK's ``word_tokenize``, re-joined with single
    spaces, stripped of every character found in ``string.punctuation`` and
    finally filtered against NLTK's English stop-word list. The casing of the
    surviving words is left untouched.

    Args:
        mood: Raw sentence as a string.

    Returns:
        The remaining words joined by single spaces.
    """
    # Build the stop-word set once per call: membership tests are O(1) and the
    # corpus list is not re-read for every single word.
    stop_words = set(stopwords.words('english'))

    joined = " ".join(word_tokenize(mood))  # Create tokens and join them again
    no_punct = ''.join(char for char in joined if char not in string.punctuation)

    # Compare each individual word (lower-cased, because the stop-word list is
    # lower case) against the stop words. Testing the whole sentence instead
    # would never match, so no stop word would be removed.
    kept_words = [word for word in no_punct.split() if word.lower() not in stop_words]
    return " ".join(kept_words)

**Step-4:** Storing new data in cleaned_text column.

In [None]:
# Run clean_text over every raw sentence in 'Text' and keep the result
# alongside the original data in a new 'cleaned_text' column
df_train['cleaned_text'] = df_train['Text'].map(clean_text)
df_train

In [None]:
# Preview the first five cleaned sentences
df_train["cleaned_text"].head()

**Step-4:** Removing special characters and extra spaces, and converting to lower case

In [None]:
features = df_train['cleaned_text']
processed_features = []

# Iterate over the values themselves (not over index labels) so the loop does
# not depend on the DataFrame having a default 0..n-1 index.
for sentence in features:
    # Replace every special (non-word) character with a space
    processed_feature = re.sub(r'\W', ' ', str(sentence))

    # Remove stand-alone single letters wherever they occur (start, middle,
    # end, or several in a row). \b anchors on word boundaries, so letters
    # inside longer words are left alone.
    processed_feature = re.sub(r'\b[a-zA-Z]\b', ' ', processed_feature)

    # Substitute multiple spaces with a single space and drop the padding
    # left at either end by the substitutions above
    processed_feature = re.sub(r'\s+', ' ', processed_feature).strip()

    # Convert to lowercase
    processed_feature = processed_feature.lower()

    processed_features.append(processed_feature)

In [None]:
# Print first five values of processed data to sanity-check the regex clean-up
processed_features[:5]

**Step-5:** Saving the above processed data into processed_text column

In [None]:
# Add the processed data as a separate column to the DataFrame
# (processed_features holds one string per row of 'cleaned_text', in row order)

df_train['processed_text'] = processed_features
df_train

**Step-6:** Extracting processed_text and Emotion then creating final dataframe.

In [None]:
# Keep only the model input (processed_text) and the target label (Emotion)
final_df = df_train[["processed_text","Emotion"]]
final_df

**Step-7:** Generating tokens and doing vectorization

In [None]:
# Tokenize the text using TweetTokenizer from NLTK

from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# Function to generate tokens using TweetTokenizer
def tokenize(text):
    """Split *text* into a list of tokens using a new NLTK TweetTokenizer."""
    return TweetTokenizer().tokenize(text)

# Word-level unigram counts, tokenized by the custom `tokenize` function.
# token_pattern=None: the default regex pattern is ignored whenever a custom
# tokenizer is supplied, and leaving it set makes scikit-learn emit a
# "token_pattern will not be used" UserWarning on fit.
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    token_pattern=None,
    lowercase=True,
    ngram_range=(1, 1),
)

In [None]:
# Fit the CountVectorizer (tokenizing with TweetTokenizer) on the processed text and
# build the document-term count matrix: one row per sentence, one column per unique token
count= vectorizer.fit_transform(final_df['processed_text'])

In [None]:
# Shape of the count matrix: (number of sentences, number of unique tokens found by the CountVectorizer)
count.shape

In [None]:
# Load the libraries required for performing classification

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

**Step-8:** Splitting the data into training and testing data sets

In [None]:
# Independent variable: the processed text; dependent variable: the emotion label
X, y = final_df['processed_text'].values, final_df['Emotion'].values

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=100,
)

**Step-9:** Doing vectorization for training and testing data

In [None]:
# Extract features using TFIDF Vectorizer
# NOTE: this rebinds `vectorizer`, which previously held the CountVectorizer.

# Limit the vocabulary to 1000 features. The vocabulary and IDF weights are
# fitted on the training split only and then reused to transform the test split.
vectorizer = TfidfVectorizer(max_features=1000)
X_train_idf = vectorizer.fit_transform(X_train)
X_test_idf = vectorizer.transform(X_test)

In [None]:
# Tabulate the fitted IDF weight of every term in the TF-IDF vocabulary
df_idf = pd.DataFrame(
    {"idf_weights": vectorizer.idf_},
    index=vectorizer.get_feature_names_out(),
)
# Show the five terms with the highest IDF weight (sorted in descending order)
df_idf.sort_values(by='idf_weights', ascending=False).head()

**Step-10:** Model building (generating the requested models) and model evaluation

In [None]:
# Perform Multinomial Naive Bayes Classification
# Fit MultinomialNB (default parameters) on the TF-IDF features of the training split
mnb = MultinomialNB()
mnb.fit(X_train_idf, y_train)

In [None]:
# Predict the emotion of every test sentence with the fitted Naive Bayes model
pred_mnb = mnb.predict(X_test_idf)

# Fraction of test sentences whose predicted emotion equals the true label
acc = accuracy_score(y_test, pred_mnb)

# Start the model-comparison table; later cells append one row per model
results = pd.DataFrame({'Model': ['Multinomial Naive Bayes'], 'Accuracy': [acc]})

print(results)

In [None]:
# Random Forest with the 'gini' split criterion, trained on the same TF-IDF
# features so its accuracy can be compared with the Naive Bayes score

from sklearn.ensemble import RandomForestClassifier

clf_rf = RandomForestClassifier()
clf_rf.fit(X_train_idf, y_train)

# Score the model on the held-out test split
y_pred_rf = clf_rf.predict(X_test_idf)
acc = accuracy_score(y_test, y_pred_rf)

# Append this model's row to the comparison table
model_results = pd.DataFrame({'Model': ['Random Forest(Gini)'], 'Accuracy': [acc]})
results = pd.concat([results, model_results], ignore_index=True)
print(results)

In [None]:
# Random Forest again, this time splitting on information gain ('entropy').
# Note that clf_rf and y_pred_rf are rebound to this model.

from sklearn.ensemble import RandomForestClassifier

clf_rf = RandomForestClassifier(criterion='entropy')
clf_rf.fit(X_train_idf, y_train)

# Score the model on the held-out test split
y_pred_rf = clf_rf.predict(X_test_idf)
acc = accuracy_score(y_test, y_pred_rf)

# Append this model's row to the comparison table
model_results = pd.DataFrame({'Model': ['Random Forest(Entropy)'], 'Accuracy': [acc]})
results = pd.concat([results, model_results], ignore_index=True)
print(results)

In [None]:
# Support Vector Machine classifier (SVC with default parameters)
from sklearn.svm import SVC
clf_svc = SVC()
clf_svc.fit(X_train_idf, y_train)

# Predict using testing data. The result is stored under its own name so the
# Random Forest predictions held in y_pred_rf are not overwritten by the
# SVM's output.
y_pred_svc = clf_svc.predict(X_test_idf)

# Calculate accuracy
acc = accuracy_score(y_test, y_pred_svc)

# Append this model's row to the comparison table
model_results = pd.DataFrame([['SVC by SVM ', acc]],
               columns = ['Model', 'Accuracy'])

results = pd.concat([results,model_results], ignore_index = True)
print(results)

In [None]:
# Display confusion matrix for Random Forest

# Predict directly with clf_rf, the Random Forest fitted last above
# (criterion='entropy'), so the matrix is guaranteed to describe the Random
# Forest regardless of what the shared y_pred_rf variable currently holds.
confusion_matrix(y_test, clf_rf.predict(X_test_idf))

**Conclusion**: The Random Forest classifier has performed the best.