# Loading and Preprocessing Data

In [1]:
import pandas as pd
# Reading JSON file into a DataFrame
df = pd.read_csv("D:\\Text Based Analysis Sem 2\\air_india_reviews.csv")
# Displaying a random sample of 10 rows from the DataFrame
df.sample(10)

Unnamed: 0,published_date,rating,type,helpful_votes,title,text
2856,2019-06-04T17:06:34-04:00,1,review,0,management is very bad,the flight was supposed to start @1:30 for whi...
370,2023-11-18T03:32:28-05:00,1,review,0,WORST AIRLINE EVER,They made us wait 12 hours had a technical pro...
668,2023-06-02T12:49:04-04:00,1,review,1,Tried again after 20 years but no improvement,I booked a flight to India via Air India in Ju...
2530,2019-08-07T05:50:37-04:00,2,review,2,Bad experience and no help from the flight com...,"We travelled from Mumbai to Goa, where the fli..."
2644,2019-07-09T19:56:57-04:00,5,review,0,Most underrated,On time and good service. Very smooth touchdow...
3251,2019-02-26T19:38:00-05:00,3,review,0,Worst nightmare airlines - AIR India.,At most times the flight would be delayed or c...
1561,2020-06-14T13:08:28-04:00,1,review,0,Worst Airline Ever!,The Customer Service is a joke!! I have tried ...
2283,2019-10-13T07:18:25-04:00,1,review,1,Missed connecting flight,We were travelling from Varanasi to Delhi and ...
1161,2022-03-08T08:59:57-05:00,1,review,0,Mr Gill,AVOID BOOKING VIA AIR INDIA WEBSITE!!! Been ov...
3837,2018-11-11T11:31:22-05:00,4,review,0,Multicity ticket with Air India. Very useful.I...,I have visited India last September 2018 with ...


# Sentiment Analysis Setup

In [2]:
# Importing the preprocessing module from the scikit-learn library
from sklearn import preprocessing
# Importing the Natural Language Toolkit (NLTK)
import nltk
# Downloading the opinion lexicon dataset from NLTK
nltk.download('opinion_lexicon')
# Importing the opinion_lexicon dataset from the NLTK corpus module
from nltk.corpus import opinion_lexicon
# Importing the word_tokenize function from the NLTK tokenize module
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\tejas\AppData\Roaming\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


In [3]:
# Printing the total number of words in the opinion_lexicon dataset
print('Total number of words in opinion lexicon', len(opinion_lexicon.words()))
# Printing examples of positive words in the opinion_lexicon dataset
print('Examples of positive words in opinion lexicon',
opinion_lexicon.positive()[:10])
# Printing examples of negative words in the opinion_lexicon dataset
print('Examples of negative words in opinion lexicon',
opinion_lexicon.negative()[:10])

Total number of words in opinion lexicon 6789
Examples of positive words in opinion lexicon ['a+', 'abound', 'abounds', 'abundance', 'abundant', 'accessable', 'accessible', 'acclaim', 'acclaimed', 'acclamation']
Examples of negative words in opinion lexicon ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably', 'abominate', 'abomination', 'abort', 'aborted']


# Applying Sentiment Analysis

In [4]:
# Let's create a dictionary which we can use for scoring our review text
# Downloading the 'punkt' tokenizer from NLTK, which is required for word tokenization
nltk.download('punkt')
# Renaming the column 'reviewText' to 'text' for consistency and clarity
df.rename(columns={"text": "text"}, inplace=True)
# Assigning positive and negative scores for sentiment analysis
pos_score = 1
neg_score = -1
# Creating an empty dictionary to store word scores for sentiment analysis
word_dict = {}

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tejas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Data Exploration

In [5]:
# Adding the positive words to the dictionary
for word in opinion_lexicon.positive():
    word_dict[word] = pos_score

# Function to Calculate Sentiment Score using Bing Liu's Lexicon

In [6]:
# Define a function to calculate the sentiment score of a given text using Bing Liu's lexicon.
def bing_liu_score(text):
    sentiment_score = 0 # Initialize the sentiment score to zero.
    bag_of_words = word_tokenize(text.lower())
    # Iterate over each word in the bag of words.
    for word in bag_of_words:
        # Check if the word exists in the word dictionary.
        if word in word_dict:
            sentiment_score += word_dict[word]
    return sentiment_score # Return the final sentiment score of the text.

# Data Preprocessing and Sentiment Analysis

In [7]:
# Fill missing values in the 'text' column with 'no review'
df['text'].fillna('no review', inplace=True)
# Apply the bing_liu_score function to calculate sentiment scores for each text in the 'text' column and create a new column 'Bing_Liu_Score'
df['Bing_Liu_Score'] = df['text'].apply(bing_liu_score)

# Displaying Selected Columns for Analysis

In [9]:
# Displaying the first 10 rows of selected columns ('overall', 'text', 'Bing_Liu_Score')
df[['rating',"text", 'Bing_Liu_Score']].head(10)

Unnamed: 0,rating,text,Bing_Liu_Score
0,1,From their side my baggage is broken but still...,0
1,1,I took AI 188 noon -9th March 2024 - it was de...,0
2,1,one of worst flight to fly with book one way l...,1
3,3,The positive aspects on my flight were: punctu...,5
4,1,"Extremally poor services in flight, aircraft i...",4
5,1,i booked a flight del to Vancouver and return ...,0
6,1,Very disappointed service and even they don’t ...,2
7,2,If a travelled with two kids staff not support...,2
8,1,Absolutely disgusting service. They said my se...,1
9,1,"Very bad experience with baggage service, I la...",1


# Grouping and Aggregating Sentiment Scores

In [10]:
# Grouping the DataFrame by the 'overall' column and calculating the mean of 'Bing_Liu_Score' for each group
df.groupby('rating').agg({'Bing_Liu_Score':'mean'})

Unnamed: 0_level_0,Bing_Liu_Score
rating,Unnamed: 1_level_1
1,2.584187
2,3.339806
3,3.613008
4,4.616052
5,5.23816
