In [1]:
# Vishak Baburaj
# Sentiment Analysis
# On the 2nd variable - Student Emotions

# Exploring and cleaning the data

### Importing required libraries

In [2]:
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

### Importing dataset CSV file 

In [None]:
# Load the "XII Board Exams-2021" CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell only
# runs where that exact file exists — consider a path relative to the notebook.
data = pd.read_csv(r"C:\Users\visha\Desktop\python\XII Board Exams-2021.csv")
# Preview the first rows to confirm the file loaded as expected.
data.head()

### New header for each variable

In [None]:
# Map each verbose survey question (the original CSV header) to a short column name.
new_header = {
    'Timestamp': 'timestamp',
    'Which board you are part of?': 'syllabus',
    'How do you feel when you are not aware of the status of your board exams? You may select more than one option given below:': 'emotion',
    'What is your opinion about Class XII board exams?': 'opinion',
    'State the reasons for why do you want the exams to be cancelled? (Safety, Status of mind, Future plans, Health etc.)': 'reason_cancelling',
    'State the reasons for why do you want the exams to be conducted on a later date? (Safety, Status of mind, Future plans, Entrance Exams, Health etc.)': 'reason_conducting_late',
    'Name the state from which you will be giving your XII Board exam(eg: Karnataka)': 'state',
}
# rename() returns a new frame by default, so `data` keeps its original headers.
datanew = data.rename(columns=new_header)
datanew.head()

## Exploring dataset

### View column headers

In [None]:
datanew.columns.values

### Dataset info

In [None]:
datanew.info()

### Describing the dataset

In [None]:
datanew.describe()

### Required variables 

In [None]:
# Keep only the six columns listed here; every other column (including 'timestamp')
# is dropped from `datanew`.
column = ['syllabus','emotion','opinion','reason_cancelling','reason_conducting_late','state']
datanew = datanew[column]
datanew.head(5)

### Finding the null values in each variable of the dataset

In [None]:
print(datanew.isna().sum())

### Deleting rows where there are null values in the `emotion` variable

In [None]:
# Remove every row whose 'emotion' value is missing; nulls in other columns are kept.
datanew = datanew.dropna(subset=["emotion"])
datanew.head(5)

## Cleaning of data

In [None]:
print(datanew.isna().sum())

In [None]:
len(datanew)

In [None]:
# Drop the first two remaining rows by position.
# NOTE(review): the reason for skipping these two rows is not stated — confirm they are
# test/invalid responses. Because the result is assigned back to `datanew`, re-running
# this cell drops two more rows each time.
datanew = datanew.iloc[2:]
datanew

### Converting case into lower

In [None]:
# Lower-case the text in every remaining column, one column at a time.
# NOTE(review): the .str accessor only works on string columns — this assumes all six
# selected columns hold text; confirm against datanew.info().
for columns in datanew.columns:
    datanew[columns] = datanew[columns].str.lower()
    
datanew.head(5)

In [None]:
# Pull out the three columns reused later as standalone Series (they keep datanew's index).
opinions = datanew['opinion']
syllabus = datanew['syllabus']
emotions = datanew['emotion']

### Removing special characters

In [None]:
def remove_special_characters(sentence, punctuation=False):
    """Replace unwanted characters in ``sentence`` with a single space each.

    Args:
        sentence: Text to clean. Leading and trailing whitespace is stripped first.
        punctuation: If False (default), every character that is not an ASCII
            letter, a digit or a space is replaced. If True, only the characters
            listed in ``string.punctuation`` are replaced, so other characters
            (for example accented letters) are kept.

    Returns:
        The cleaned string. Each removed character becomes exactly one space, so
        consecutive spaces may appear in the result.
    """
    sentence = sentence.strip()
    if punctuation:
        # string.punctuation is a plain list of symbols, not a regular expression:
        # escape it and wrap it in a character class so each symbol matches literally.
        pattern = '[' + re.escape(string.punctuation) + ']'
    else:
        pattern = r'[^a-zA-Z0-9 ]'
    return re.sub(pattern, r' ', sentence)

In [None]:
# Clean every emotion response with the default mode (anything that is not a letter,
# digit or space becomes a space). The result is a plain list in the same order as `emotions`.
cleaned_emotions = [remove_special_characters(sentence) for sentence in emotions] 
# NOTE(review): this prints the whole list, one entry per response.
print(cleaned_emotions)

# Analysis 1

# Sentiment analysis of students of class 12th regarding their board exams

# Using Lexicon based sentiment analysis

## Method used: VADER lexicon-based sentiment analysis

In [None]:
# Install the vaderSentiment package that provides SentimentIntensityAnalyzer.
# NOTE(review): vaderSentiment is already imported in the imports cell at the top of the
# notebook, so on a fresh kernel this install runs too late — consider moving it first.
!pip install vaderSentiment

### Identifying the negative, positive, neutral and compound score of each response 

In [None]:
# Score every cleaned response once with VADER, then split the result dictionaries
# into one list per component (same order as `cleaned_emotions`).
analyzer = SentimentIntensityAnalyzer()
scores = [analyzer.polarity_scores(text) for text in cleaned_emotions]

neg = [float(score['neg']) for score in scores]
pos = [float(score['pos']) for score in scores]
neu = [float(score['neu']) for score in scores]
compound_score = [score['compound'] for score in scores]

### New dataset regarding sentiment of each student response

In [None]:
# Combine the original answers with their VADER scores, one row per response.
# 'emotions' holds the cleaned text; 'emotions_with_punct' keeps the uncleaned answer.
analysis = {
    'syllabus': syllabus,
    'emotions': cleaned_emotions,
    'opinions': opinions,
    'emotions_with_punct': emotions,
    'neg': neg,
    'pos': pos,
    'neu': neu,
    'compound': compound_score,
}
# The column order is the dictionary's insertion order.
sentiments = pd.DataFrame(analysis, columns=list(analysis))
sentiments.head(5)

In [None]:
# Label each response from its compound score: start everything as 'neutral', then
# mark compound >= 0.05 as 'positive' and compound <= -0.05 as 'negative'.
sentiments['sentiment'] = 'neutral'
sentiments.loc[sentiments['compound'] >= 0.05, 'sentiment'] = 'positive'
sentiments.loc[sentiments['compound'] <= -0.05, 'sentiment'] = 'negative'
sentiments.head(5)

### Data visualization of students' sentiments in a bar graph

In [None]:
no_of_students = sentiments['sentiment'].value_counts()
no_of_students

In [None]:
len(sentiments)

In [None]:
sns.set(font_scale=1.8)
# One bar per sentiment label, with the number of responses carrying that label.
sentiment_counts = sentiments['sentiment'].value_counts()
ax = sentiment_counts.plot.bar(figsize=(7, 6), rot=0)
ax.set_xlabel("Sentiment", labelpad=16)
ax.set_ylabel("Count of Students", labelpad=16)
ax.set_title("Number of students feeling negative, positive or neutral type of sentiments in the sample data", y=1.05);

In [None]:
no_of_students_syllabus = sentiments['syllabus'].value_counts()
no_of_students_syllabus

In [None]:
# Grouped bars: one group per syllabus, one bar per sentiment label within it.
fig, ax = plt.subplots(figsize=(15, 6))
sns.countplot(data=sentiments, x='syllabus', hue='sentiment', ax=ax)
ax.set_xlabel("Syllabus", labelpad=16)
ax.set_ylabel("Number of Students", labelpad=16)
ax.set_title("Number of students feeling different type of sentiments based on syllabus in the sample data", y=1.05);

# --------------------------------------------------------

### Finding number of times a student has used a keyword and grouping it by their sentiments

### Splitting the words

In [None]:
sentiments['emotions_with_punct']

In [None]:
split_emotions = sentiments['emotions_with_punct'].str.split(";")
split_emotions

### Word Frequency grouped by negative, neutral and positive sentiments

In [None]:
# Store the ';'-split answers as lists, then explode() so each list item gets its own
# row and count how often each item occurs within every sentiment group.
# The result is a Series indexed by (sentiment, words).
sentiments['words'] = split_emotions
word_frequency = sentiments.explode('words').groupby('sentiment')['words'].value_counts()
word_frequency

In [None]:
#word_frequency.to_excel (r'Word Frequency Sentiment Analysis.xlsx', index = False, header=True)

### Assigning the variables

In [None]:
# Select the per-word counts for each sentiment label from the first index level.
# NOTE(review): each lookup raises KeyError if no response received that label.
word_frequency_negative = word_frequency['negative']
word_frequency_positive = word_frequency['positive']
word_frequency_neutral = word_frequency['neutral']

## Data Visualization (Word Frequency)

In [None]:
sns.set(font_scale=1.8)
# One bar per keyword used in negative-sentiment responses.
ax = word_frequency_negative.plot(kind='bar', figsize=(7, 6))
plt.xlabel("Words used", labelpad=16)
# rotation must be a number (or 'vertical'/'horizontal'); the string '90' is rejected
# by newer matplotlib versions. The earlier rot=0 was overridden here, so it is dropped.
plt.xticks(fontsize=12, rotation=90)
plt.ylabel("Number of times used", labelpad=16)
plt.title("Number of times a negative sentiment student used the following keyword", y=1.05);

In [None]:
sns.set(font_scale=1.8)
# One bar per keyword used in positive-sentiment responses.
ax = word_frequency_positive.plot(kind='bar', figsize=(7, 6))
plt.xlabel("Words used", labelpad=16)
# rotation must be a number (or 'vertical'/'horizontal'); the string '90' is rejected
# by newer matplotlib versions. The earlier rot=0 was overridden here, so it is dropped.
plt.xticks(fontsize=12, rotation=90)
plt.ylabel("Number of times used", labelpad=16)
plt.title("Number of times a positive sentiment student used the following keyword", y=1.05);

In [None]:
sns.set(font_scale=1.8)
# One bar per keyword used in neutral-sentiment responses.
ax = word_frequency_neutral.plot(kind='bar', figsize=(7, 6))
plt.xlabel("Words used", labelpad=16)
# rotation must be a number (or 'vertical'/'horizontal'); the string '90' is rejected
# by newer matplotlib versions. The earlier rot=0 was overridden here, so it is dropped.
plt.xticks(fontsize=12, rotation=90)
plt.ylabel("Number of times used", labelpad=16)
plt.title("Number of times a neutral sentiment student used the following keyword", y=1.05);

# Combination of 2 and 3 words

In [None]:
sentiments.head(3)

### Converting emotions into short forms

In [None]:
testing = sentiments['emotions']

In [None]:
# Shorten the two multi-word emotions to single tokens so each counts as one word.
# Both replacements are chained on the same string: applying them in two separate
# assignments that each re-read `testing` can let the second one overwrite the first,
# losing the 'ITC' substitution.
sentiments['emotions'] = [
    w.replace('inability to concentrate', 'ITC').replace('more time for preparation', 'MTFP')
    for w in testing
]
sentiments['emotions']

In [None]:
sentiments.head(5)

### Identifying number of words used

In [None]:
# Count the whitespace-separated tokens in each (abbreviated) emotions string.
sentiments['totalwords'] = sentiments['emotions'].map(lambda text: len(text.split()))
sentiments

### Extracting the 2-word and 3-word responses

In [None]:
# Responses made of exactly two words, restricted to the columns needed for reporting.
columns1 = ['syllabus','emotions','opinions','neg','pos','neu','compound','sentiment','totalwords']
two_words = sentiments.loc[sentiments['totalwords'] == 2, columns1]
two_words.head(5)

In [None]:
# Responses made of exactly three words, restricted to the columns needed for reporting.
columns2 = ['syllabus','emotions','opinions','neg','pos','neu','compound','sentiment','totalwords']
three_words = sentiments.loc[sentiments['totalwords'] == 3, columns2]
three_words.head(5)

In [None]:
print(len(two_words))
print(len(three_words))

### Extracting the fully negative and fully positive 2-word and 3-word responses

In [None]:
# Keep only rows whose 'neg' (or 'pos') score is exactly 1.0.
# NOTE(review): presumably these are responses in which every word is negative
# (or positive) — confirm; the comparison is an exact float equality.
negative_two_words = two_words.loc[two_words['neg'] == 1.0]
positive_two_words = two_words.loc[two_words['pos'] == 1.0]
negative_three_words = three_words.loc[three_words['neg'] == 1.0]
positive_three_words = three_words.loc[three_words['pos'] == 1.0]

In [None]:
print(len(negative_two_words))
print(len(positive_two_words))
print(len(negative_three_words))
print(len(positive_three_words))

### Extracting the mixed-sentiment 2-word and 3-word responses

In [None]:
# Two-word responses whose compound score lies in [-0.4, 0.4] (both ends included).
# between() is inclusive on both sides by default; the boolean form inclusive=True is
# deprecated and rejected by pandas 2.x, so the default is relied on instead.
columns3 = ['syllabus','emotions','opinions','neg','pos','neu','compound','sentiment','totalwords']
two_words_compound = two_words.loc[two_words['compound'].between(-0.4, 0.4)]
two_words_compound = two_words_compound[columns3]
two_words_compound.head(5)

In [None]:
# Three-word responses whose compound score lies in [-0.4, 0.4] (both ends included).
# between() is inclusive on both sides by default; the boolean form inclusive=True is
# deprecated and rejected by pandas 2.x, so the default is relied on instead.
columns4 = ['syllabus','emotions','opinions','neg','pos','neu','compound','sentiment','totalwords']
three_words_compound = three_words.loc[three_words['compound'].between(-0.4, 0.4)]
three_words_compound = three_words_compound[columns4]
three_words_compound.head(5)

In [None]:
print(len(two_words_compound))
print(len(three_words_compound))

### Exporting to excel

In [None]:
# Final per-response table for export: leaves out the helper columns
# 'emotions_with_punct', 'words' and 'totalwords'.
columns5 = ['syllabus','emotions','opinions','neg','pos','neu','compound','sentiment']
sentiment_analysis = sentiments[columns5]
sentiment_analysis.head(5)

In [None]:
word_frequency

In [None]:
# Write every result table to its own sheet of a single workbook.
# The mapping keeps the sheet order: Sheet1 is written first, Sheet8 last.
sheets = {
    'Sheet1': sentiment_analysis,
    'Sheet2': word_frequency,
    'Sheet3': negative_two_words,
    'Sheet4': positive_two_words,
    'Sheet5': negative_three_words,
    'Sheet6': positive_three_words,
    'Sheet7': two_words_compound,
    'Sheet8': three_words_compound,
}
with pd.ExcelWriter('Analysis 1.xlsx', engine='xlsxwriter') as writer:
    for sheet_name, table in sheets.items():
        table.to_excel(writer, sheet_name=sheet_name)