<a href="https://colab.research.google.com/github/Rohankohli08/Project-1-/blob/main/MI3P1_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reading in the data and splitting the columns

In [None]:
import pandas as pd
# Load the raw review export. The python engine together with
# on_bad_lines="warn" reports malformed rows as warnings instead of raising.
df = pd.read_csv("Amazon_Reviews.csv", engine="python", on_bad_lines="warn")

# Each entry is (source column, separator, new columns receiving the pieces).
# The number of pieces produced by the split must equal the number of new
# columns, otherwise the assignment raises.
column_splits = [
    ('Review Date', '-', ['year_posted', 'month_posted', 'daytime_posted']),
    ('Date of Experience', ' ', ['month_ex', 'day_ex', 'year_ex']),
    ('Review Count', ' ', ['review count', 'extra']),
]
for source_col, separator, target_cols in column_splits:
    df[target_cols] = df[source_col].str.split(separator, expand=True)

# The numeric rating is the second whitespace-separated token of 'Rating'.
df['rating'] = pd.to_numeric(df['Rating'].str.split().str[1])
df['review count'] = pd.to_numeric(df['review count'])

Cleaning the data based on analysis plan

In [None]:
import numpy as np
# Columns required by the analysis plan.
analysis_columns = [
    'Country',
    'Review Title',
    'Review Text',
    'year_posted',
    'month_posted',
    'month_ex',
    'year_ex',
    'review count',
    'rating',
]
clean_df = df[analysis_columns]

# Keep reviews posted in 2023 or 2024 (year_posted holds strings, not ints).
clean_df = clean_df[clean_df['year_posted'].isin(['2023', '2024'])]

# Drop neutral 3-star reviews AND reviews without a rating. A missing rating
# is NaN, which passes `!= 3` and fails `<= 2`, so without the notna() guard
# such rows would be silently labelled 'high' below.
# .copy() makes clean_df an independent frame so the column assignment that
# follows does not raise SettingWithCopyWarning.
has_usable_rating = clean_df['rating'].notna() & (clean_df['rating'] != 3)
clean_df = clean_df[has_usable_rating].copy()

# Binary target: ratings 1-2 are 'low', ratings 4-5 are 'high'.
clean_df['rating_group'] = np.where(
    clean_df['rating'] <= 2,
    'low',
    'high'
)
clean_df

Unnamed: 0,Country,Review Title,Review Text,year_posted,month_posted,month_ex,year_ex,review count,rating,rating_group
0,US,A Store That Doesn't Want to Sell Anything,"I registered on the website, tried to order a ...",2024,09,September,2024,1.0,1.0,low
1,GB,Had multiple orders one turned up and…,Had multiple orders one turned up and driver h...,2024,09,September,2024,9.0,1.0,low
2,GB,I informed these reprobates,I informed these reprobates that I WOULD NOT B...,2024,09,September,2024,90.0,1.0,low
3,AU,Advertise one price then increase it on website,I have bought from Amazon before and no proble...,2024,09,September,2024,5.0,1.0,low
4,GB,If I could give a lower rate I would,If I could give a lower rate I would! I cancel...,2024,09,September,2024,8.0,1.0,low
...,...,...,...,...,...,...,...,...,...,...
12244,GB,Update on previous.,I just wanted to update on my previous review ...,2024,04,December,2020,8.0,1.0,low
17109,US,Amazon Selling Dangerous Product 12-22-2018,I have only written two reviews in the past. I...,2024,01,December,2018,18.0,2.0,low
17614,LT,"Had minor problems, but overall - great","I had some problems with them, such as fake se...",2023,12,August,2018,36.0,5.0,high
18465,US,Being flooded with far too many Chinese …,Amazon has become similar to a flea market of ...,2024,02,July,2017,6.0,1.0,low


In [None]:
!pip install vaderSentiment



In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [None]:
# Drop rows with missing review text.
# .copy() gives model_df its own data: columns are added to it below and in
# later cells, and assigning onto a filtered slice of clean_df would raise
# SettingWithCopyWarning.
model_df = clean_df.dropna(subset=['Review Text']).copy()

# Encode target variable: high = 1, low = 0.
# The mapping is spelled out instead of computed as `1 - LabelEncoder code`:
# LabelEncoder numbers classes in sorted order (high=0, low=1), so the flip is
# only correct while both classes are present. A subset containing only 'low'
# reviews would be encoded as 0 and then flipped to 1.
model_df['rating_binary'] = (model_df['rating_group'] == 'high').astype(int)

# label_encoder is still fitted so the name remains defined for any cell that
# uses it. Its codes are high=0, low=1, i.e. the opposite of rating_binary.
label_encoder = LabelEncoder()
label_encoder.fit(model_df['rating_group'])

model_df[['rating_group', 'rating_binary']].head()

Unnamed: 0,rating_group,rating_binary
0,low,0
1,low,0
2,low,0
3,low,0
4,low,0


In [None]:
# Sentiment analysis with VADER
analyzer = SentimentIntensityAnalyzer()

def vader_scores(text):
    """Score one review with VADER.

    The input is converted with str() before scoring. Returns a pd.Series of
    four values in the order compound, pos, neu, neg, taken from the dict
    produced by analyzer.polarity_scores. The Series has the default integer
    index, not named labels.
    """
    polarity = analyzer.polarity_scores(str(text))
    return pd.Series([polarity[key] for key in ('compound', 'pos', 'neu', 'neg')])

# Column order must match the order of the values returned by vader_scores
# (compound, pos, neu, neg).
sentiment_cols = ['sent_compound', 'sent_pos', 'sent_neu', 'sent_neg']
model_df[sentiment_cols] = model_df['Review Text'].apply(vader_scores)

model_df[sentiment_cols].head()

Unnamed: 0,sent_compound,sent_pos,sent_neu,sent_neg
0,0.5499,0.11,0.809,0.081
1,-0.695,0.0,0.864,0.136
2,-0.9276,0.0,0.889,0.111
3,-0.6525,0.1,0.755,0.145
4,-0.812,0.075,0.768,0.157


In [None]:
# Sentiment-only model: stratified 80/20 train/test split on the four VADER
# scores, with a fixed random_state so the split is reproducible.
sentiment_features = ['sent_compound', 'sent_pos', 'sent_neu', 'sent_neg']
X_sent = model_df[sentiment_features]
y = model_df['rating_binary']

X_train, X_test, y_train, y_test = train_test_split(
    X_sent, y, test_size=0.2, stratify=y, random_state=42
)

print("Training size:", X_train.shape)
print("Test size:", X_test.shape)

# Logistic regression. class_weight gives class 1 (high) three times the
# weight of class 0 (low) in the loss.
model_sent = LogisticRegression(
    max_iter=1000,
    class_weight={0: 1, 1: 3},
).fit(X_train, y_train)

# Model evaluation on the held-out split; precision/recall/F1 use the
# scorers' default positive label (1).
y_pred_sent = model_sent.predict(X_test)

sent_results = {
    metric_name: scorer(y_test, y_pred_sent)
    for metric_name, scorer in [
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    ]
}

print(sent_results)
print("\nClassification Report:")
print(classification_report(y_test, y_pred_sent, target_names=['Low', 'High']))

Training size: (5256, 4)
Test size: (1314, 4)
{'Accuracy': 0.8888888888888888, 'Precision': 0.5815217391304348, 'Recall': 0.6079545454545454, 'F1': 0.5944444444444444}

Classification Report:
              precision    recall  f1-score   support

         Low       0.94      0.93      0.94      1138
        High       0.58      0.61      0.59       176

    accuracy                           0.89      1314
   macro avg       0.76      0.77      0.77      1314
weighted avg       0.89      0.89      0.89      1314



In [None]:
# Prep data (additional modeling): sentiment scores plus the reviewer's
# review count, the posting month and a grouped country indicator.
model_df['month_posted'] = pd.to_numeric(model_df['month_posted'], errors='coerce')

# The five most frequent countries keep their own category; every other
# value (including a missing country) is mapped to 'Other'.
top_countries = model_df['Country'].value_counts().nlargest(5).index
model_df['Country_grouped'] = model_df['Country'].where(
    model_df['Country'].isin(top_countries),
    'Other'
)

# drop_first=True leaves one country level out as the reference category.
country_dummies = pd.get_dummies(model_df['Country_grouped'], drop_first=True)

X_full = pd.concat([
    model_df[['sent_compound', 'sent_pos', 'sent_neu', 'sent_neg',
              'review count', 'month_posted']],
    country_dummies
], axis=1)

# LogisticRegression.fit raises on NaN input. errors='coerce' above turns an
# unparseable month into NaN, and 'review count' may be missing as well, so
# keep only complete rows and take the target from exactly the same rows.
complete_rows = X_full.notna().all(axis=1)
X_full = X_full[complete_rows]
y_full = model_df.loc[complete_rows, 'rating_binary']

X_train, X_test, y_train, y_test = train_test_split(
    X_full, y_full,
    test_size=0.2,
    stratify=y_full,
    random_state=42
)

# Same class weighting as the sentiment-only model (model_sent). Without it
# the two result dicts differ in both the feature set and the weighting, so a
# change in precision/recall could not be attributed to the added features.
model_full = LogisticRegression(max_iter=1000, class_weight={0: 1, 1: 3})
model_full.fit(X_train, y_train)

y_pred_full = model_full.predict(X_test)

full_results = {
    "Accuracy": accuracy_score(y_test, y_pred_full),
    "Precision": precision_score(y_test, y_pred_full),
    "Recall": recall_score(y_test, y_pred_full),
    "F1": f1_score(y_test, y_pred_full)
}

full_results

{'Accuracy': 0.9079147640791476,
 'Precision': 0.8767123287671232,
 'Recall': 0.36363636363636365,
 'F1': 0.5140562248995983}

In [None]:
# Feature importance combined: one row per column of X_full, ordered by the
# absolute value of its logistic-regression coefficient (largest first).
# NOTE(review): the coefficients are on the raw feature scale (no scaling step
# is applied to X_full in the cell that builds it), so magnitudes of features
# measured in different units are not directly comparable.
coefficients = model_full.coef_[0]

feature_importance = (
    pd.DataFrame({'Feature': X_full.columns, 'Coefficient': coefficients})
    .assign(Abs_Coefficient=lambda frame: frame['Coefficient'].abs())
    .sort_values(by='Abs_Coefficient', ascending=False)
)

feature_importance

Unnamed: 0,Feature,Coefficient,Abs_Coefficient
3,sent_neg,-7.312382,7.312382
1,sent_pos,7.272081,7.272081
2,sent_neu,-1.026122,1.026122
7,IN,0.993165,0.993165
0,sent_compound,0.893139,0.893139
9,Other,0.715269,0.715269
8,NL,-0.075685,0.075685
10,US,0.022051,0.022051
6,GB,-0.006134,0.006134
4,review count,-0.001494,0.001494


In [None]:
#testing chunk