In [None]:
import pandas as pd
import numpy as np
import nltk
import textract
import docx2txt
import matplotlib.pyplot as plt
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from gensim import corpora, models
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
data = pd.read_excel("Flysafe Airlines.xlsx", index_col=[0])

# EDA

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 308384 entries, 0 to 308383
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   comment  308368 non-null  object
 1   label    308384 non-null  object
dtypes: object(2)
memory usage: 7.1+ MB


Original Data Values

In [None]:
data.shape

(308384, 2)

In [None]:
data.describe()

Unnamed: 0,comment,label
count,308368,308384
unique,304133,2
top,Good service,Postive
freq,145,271113


In [None]:
data.head()

Unnamed: 0,comment,label
0,"Mohammad harun, he is an awesome guy very info...",Postive
1,"amazing guy gaurav was, so patience and kind. ...",Postive
2,Gaurav was very knowledgeable and very helpful...,Postive
3,I called them regarding my flight cancellation...,Postive
4,VERY GOOD SERVICE BY GAURAV LOHAT SERVED AS SO...,Postive


Finding null values

In [None]:
data.isnull().sum()

comment    16
label       0
dtype: int64

In [None]:
data = data.dropna()

In [None]:
data.shape

(308368, 2)

Checking duplicate rows

In [None]:
data.duplicated().sum()

4046

In [None]:
data = data.drop_duplicates(keep='last')

In [None]:
data.shape

(304322, 2)

In [None]:
data.head()

Unnamed: 0,comment,label
0,"Mohammad harun, he is an awesome guy very info...",Postive
1,"amazing guy gaurav was, so patience and kind. ...",Postive
2,Gaurav was very knowledgeable and very helpful...,Postive
3,I called them regarding my flight cancellation...,Postive
4,VERY GOOD SERVICE BY GAURAV LOHAT SERVED AS SO...,Postive


Removing Stop words and performing Tokenization

In [None]:
# Define a function to remove stopwords and tokenize text
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words and word.isalpha()]
    return filtered_words

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Apply the preprocess_text function to the 'comment' column of the dataframe
data['comment'] = data['comment'].astype(str)
data['tokens'] = data['comment'].apply(preprocess_text)

In [None]:
# Convert tokenized text to a bag-of-words representation
dictionary = corpora.Dictionary(data['tokens'])
corpus = [dictionary.doc2bow(text) for text in data['tokens']]

In [None]:
# Train a topic model using LDA
num_topics = 5
lda = models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

In [None]:

# Get the most probable topic for each document in the collection
topics = lda.get_document_topics(corpus)
data['topic'] = [max(t, key=lambda x: x[1])[0] for t in topics]

In [None]:
# Split the data into training and testing sets
X = data['comment']
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Define a vectorizer to convert text to a bag-of-words representation
vectorizer = CountVectorizer(stop_words='english')

In [None]:
# Transform the training data into a bag-of-words representation
X_train = vectorizer.fit_transform(X_train)

In [None]:
# Train a TF-IDF transformer on the training data
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train)

In [None]:
# Train a topic model on the training data
num_topics = 5
lda = LatentDirichletAllocation(n_components=num_topics, max_iter=20, random_state=42)
lda.fit(X_train_tfidf)

In [None]:
# Transform the testing data into a bag-of-words representation and apply TF-IDF weighting
X_test = vectorizer.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test)

In [None]:
# Use the trained model to transform the training and testing data into topic probability vectors
X_train_topics = lda.transform(X_train_tfidf)
X_test_topics = lda.transform(X_test_tfidf)

In [None]:
# Train a logistic regression classifier on the topic probabilities and sentiment labels
lr = LogisticRegression(random_state=42)
lr.fit(X_train_topics, y_train)

In [None]:
# Use the trained classifier to predict the sentiment of the testing data
y_pred = lr.predict(X_test_topics)

In [None]:
# Calculate the accuracy of the predictions
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.878518031709521


In [None]:
data.head()

Unnamed: 0,comment,label,tokens,topic
0,"Mohammad harun, he is an awesome guy very info...",Postive,"[Mohammad, harun, awesome, guy, informative, h...",4
1,"amazing guy gaurav was, so patience and kind. ...",Postive,"[amazing, guy, gaurav, patience, kind, helped,...",4
2,Gaurav was very knowledgeable and very helpful...,Postive,"[Gaurav, knowledgeable, helpful, knows, situat...",4
3,I called them regarding my flight cancellation...,Postive,"[called, regarding, flight, cancellation, narr...",0
4,VERY GOOD SERVICE BY GAURAV LOHAT SERVED AS SO...,Postive,"[GOOD, SERVICE, GAURAV, LOHAT, SERVED, SOON, R...",2


In [56]:
data.shape
data2=data

In [57]:
data.head(5)

Unnamed: 0,comment,label,tokens,topic,negative_tokens,negative_topics
0,"Mohammad harun, he is an awesome guy very info...",Postive,"[Mohammad, harun, awesome, guy, informative, h...",4,,
1,"amazing guy gaurav was, so patience and kind. ...",Postive,"[amazing, guy, gaurav, patience, kind, helped,...",4,,
2,Gaurav was very knowledgeable and very helpful...,Postive,"[Gaurav, knowledgeable, helpful, knows, situat...",4,,
3,I called them regarding my flight cancellation...,Postive,"[called, regarding, flight, cancellation, narr...",0,,
4,VERY GOOD SERVICE BY GAURAV LOHAT SERVED AS SO...,Postive,"[GOOD, SERVICE, GAURAV, LOHAT, SERVED, SOON, R...",2,,


In [58]:
data.to_csv('sentiment.csv', index=False)

In [63]:
# Define a function to extract frequent negative topics from the reviews
def extract_negative_topics(reviews):
    # Define a list of negative topic keywords
    negative_keywords = ["service", "delay", "baggage", "cancellation", "refund", "seat", "food", "entertainment"]
    # Flatten the list of tokens from all reviews
    all_tokens = [token for review in reviews for token in review]
    # Count the frequency of each negative topic keyword in the tokens
    topic_counts = Counter([token for token in all_tokens if token in negative_keywords])
    # Return a list of the most frequent negative topics
    return topic_counts.most_common()

data2["negative_tokens"] = data2[data2["label"] == "Negative"]["tokens"].apply(lambda x: [token for token in x])

# Extract frequent negative topics from the negative reviews
negative_reviews = data2[data2["label"] == "Negative"]
negative_topics = extract_negative_topics(negative_reviews["negative_tokens"].tolist())

# Save the frequent negative topics in a new column of the "data" dataframe
data2["negative_topics"] = ""
topic_counts = []
for topic, count in negative_topics:
    print(topic, count)
    topic_counts.append((topic, count))

# Create a DataFrame from the list of topic counts
df = pd.DataFrame(topic_counts, columns=["Topic", "Count"])

# Save the DataFrame (of negative topics count) to Excel
df.to_excel("negative_topics.xlsx", index=False)

service 12459
refund 9506
cancellation 2117
seat 1702
baggage 1294
delay 825
food 581
entertainment 59


In [64]:
df.head()

Unnamed: 0,Topic,Count
0,service,12459
1,refund,9506
2,cancellation,2117
3,seat,1702
4,baggage,1294


In [None]:
# save the dataframe to new excel
data2.to_excel("review_analysis.xlsx", index=False)

import pandas as pd
import plotly.express as px
import dash
import dash_core_components as dcc
import dash_html_components as html
import random

# Create a new column with the predicted sentiment label for each comment
#X = data['comment']
#X_tfidf = tfidf_transformer.transform(vectorizer.transform(X))
#X_topics = lda.transform(X_tfidf)
#data['predicted_label'] = lr.predict(X_topics)

# Calculate the percentage of positive, negative, and neutral comments
positive_percentage = round(len(data[data['label'] == 'Positive']) / len(data) * 100, 2)
negative_percentage = round(len(data[data['label'] == 'Negative']) / len(data) * 100, 2)
#neutral_percentage = round(len(data[data['label'] == 'neutral']) / len(data) * 100, 2)

# Choose a random comment to display
#random_comment = random.choice(data['comment'])
#random_comment_tokens = random.choice(data[data['comment'].str.contains(random_comment)]['tokens'])

# Create a bar chart of the predicted sentiment labels
fig = px.bar(data, x='label', color='label', labels={'label': 'Sentiment Label'}, 
             title='Predicted Sentiment Labels')

# Create a pie chart of the percentage of comments by sentiment label
fig2 = px.pie(data, names='label', values='comment', labels={'label': 'Sentiment Label', 
            'comment': 'Percentage of Comments'}, title='Percentage of Comments by Sentiment Label')

# Create a word cloud of random tokens from a comment
#fig3 = px.scatter(text=random_comment_tokens, title='Random Tokens from a Comment')

# Create the dashboard layout
app = dash.Dash(__name__)
app.layout = html.Div(children=[
    html.H1(children='Flysafe Airlines Dashboard', style={'textAlign': 'center'}),
    
    html.Div(children='''Sentiment Analysis of Customer Comments''',
             style={'textAlign': 'center'}),
    
    dcc.Graph(id='bar-chart', figure=fig),
    
    dcc.Graph(id='pie-chart', figure=fig2),
    
   # dcc.Graph(id='word-cloud', figure=fig3),
    
    html.Div(children=[html.P(f'Positive: {positive_percentage}%'), 
                       html.P(f'Negative: {negative_percentage}%')], 
                       #html.P(f'Neutral: {neutral_percentage}%')],
             style={'textAlign': 'center'}),
    
   # html.Div(children=[html.P(f'Random comment: {random_comment}'), 
    #                   html.P(f'Random tokens: {random_comment_tokens}')],
     #        style={'textAlign': 'center'})
])

if __name__ == '__main__':
    app.run_server()