# Load the dataset

In [1]:
import pandas as pd
import numpy as np
Dataset = pd.read_csv(r'C:\Users\user\Documents\BritishAirwaysNew.csv', encoding = 'latin1')

# Text Extraction and Preprocessing

In [2]:
import re # Regular Expression
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt_tab')

# Preprocess Text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z ]', '', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
Dataset['clean_text'] = Dataset['body'].astype(str).apply(preprocess_text)
Dataset['clean_text']

0       worst experience life trying deal customer ser...
1       due code sharing cathay pacific downgraded ba ...
2       lhr check quick first wing quickly security fi...
3       wouldnt recommend british airways tried call c...
4       absolutely horrible experience booked ticket e...
                              ...                        
3523    lhrjfklaxlhr check ok apart snapped early chec...
3524    lhr ham purser addresses club passengers name ...
3525    son worked british airways urged fly british a...
3526    london citynew york jfk via shannon really nic...
3527    sinlhr ba b first class old aircraft seats pri...
Name: clean_text, Length: 3528, dtype: object

# Feature Engineering

In [4]:
# Vectorize The Text
from sklearn.feature_extraction.text import TfidfVectorizer
Convert_text = TfidfVectorizer(max_features=5000)
X = Convert_text.fit_transform(Dataset['clean_text'])

In [5]:
# Create another column by Classifying Sentiments through overall_rating
Dataset['overall_rating'] = pd.to_numeric(Dataset['overall_rating'], errors='coerce')
Dataset.dropna(subset=['overall_rating'], inplace=True)
Dataset['sentiment'] = Dataset['overall_rating'].apply(lambda x: 1 if x > 3 else 0)
Dataset['sentiment']

0       0
1       0
2       1
3       0
4       0
       ..
3523    1
3524    1
3525    1
3526    1
3527    1
Name: sentiment, Length: 3524, dtype: int64

# Model Building and Evaluation

In [6]:
#Split the data into Train and Test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Dataset['sentiment'], test_size=0.2, random_state=42)

ValueError: Found input variables with inconsistent numbers of samples: [3528, 3524]

In [8]:
print(X.shape, Dataset['sentiment'].shape)

(3528, 5000) (3524,)


In [10]:
#print(X.isnull().sum(), Dataset['sentiment'].isnull().sum())


AttributeError: 'csr_matrix' object has no attribute 'isnull'

In [None]:
#Train a model and make a prediction for Sentiment Classification using LogisticRegression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

model = LogisticRegression()
model.fit(X_train, y_train)
y_hat = model.predict(X_test)
print("Sentiment Classification Report:\n", classification_report(y_test, y_pred))


In [None]:
# Predicting Satisfaction Score using RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print("Satisfaction Score Prediction Accuracy:", accuracy_score(y_test, y_pred))


In [None]:
# Customer Clustering
from sklearn.cluster import KMeans
Cus_Clus = KMeans(n_clusters=3, random_state=42)
Cus_Clus.fit(X.toarray())
Dataset['cluster'] = Cus_Clus.labels_

In [None]:
#Visualize Clusters
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X.toarray())
sns.scatterplot(x=X_pca[:,0], y=X_pca[:,1], hue=Dataset['cluster'], palette='viridis')
plt.title("Customer Clusters")
plt.show()