In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's train and test splits from the read-only Kaggle input mount.
train_data = pd.read_csv("/kaggle/input/mlp-term-2-2025-kaggle-assignment-3/train.csv")
test_data = pd.read_csv("/kaggle/input/mlp-term-2-2025-kaggle-assignment-3/test.csv")

In [None]:
# (rows, columns) of the training frame.
train_data.shape

In [None]:
# (rows, columns) of the test frame.
test_data.shape

## Identify data types of different columns

In [None]:
# Per-column non-null counts and dtypes for the training frame.
train_data.info()

In [None]:
# dtype of every training column, as a Series indexed by column name.
train_data.dtypes

In [None]:
# Per-column non-null counts and dtypes for the test frame.
test_data.info()

In [None]:
# dtype of every test column, as a Series indexed by column name.
test_data.dtypes

## Present descriptive statistics of numerical columns

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training frame;
# with default arguments describe() reports numeric columns when any are present.
train_data.describe()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the test frame;
# with default arguments describe() reports numeric columns when any are present.
test_data.describe()

## Identify and handle the missing values

In [None]:
# Total number of missing cells in the training frame (per-column counts summed to one scalar).
train_data.isna().sum().sum()

In [None]:
# Missing-value count for each test column.
test_data.isna().sum()

In [None]:
# Drop every training row that contains at least one missing value (modifies
# train_data in place), then display the per-column missing counts that remain.
train_data.dropna(axis=0, how='any', inplace=True)
train_data.isna().sum()

## Identify and handle duplicates

In [None]:
# Rows of the training frame that exactly repeat an earlier row
# (duplicated() leaves the first occurrence unflagged by default).
train_dup_mask = train_data.duplicated()
duplicated = train_data[train_dup_mask]
print(duplicated)

### Duplicate values not present

In [None]:
# Rows of the test frame that exactly repeat an earlier row
# (duplicated() leaves the first occurrence unflagged by default).
test_dup_mask = test_data.duplicated()
duplicated = test_data[test_dup_mask]
print(duplicated)

In [None]:
# Number of fully duplicated rows, printed for train first and then test.
for frame in (train_data, test_data):
    print(frame.duplicated().sum())

## Identify and handle outliers

### Capping outliers with the IQR rule (values are clipped, not removed)

In [None]:
def cap_outliers_iqr(train_data):
    """Return a copy of the frame with each numeric column clipped to its IQR fences.

    For every column selected by ``select_dtypes(include='number')`` the
    values are clipped to ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``, where the
    quartiles are computed from that same column. Non-numeric columns are
    copied through unchanged and the input frame is not modified.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame to cap. Despite the name, any DataFrame can be passed.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and columns as the input.
    """
    capped = train_data.copy()

    for name in train_data.select_dtypes(include='number').columns:
        series = train_data[name]
        q_low, q_high = series.quantile([0.25, 0.75])
        whisker = 1.5 * (q_high - q_low)
        # Values outside the fences are pulled back to the nearest fence.
        capped[name] = series.clip(lower=q_low - whisker, upper=q_high + whisker)

    return capped

In [None]:
# Clipped copy of the training frame. NOTE(review): train_capped is not
# referenced again in the visible part of the notebook.
train_capped = cap_outliers_iqr(train_data)

# Capping replaces values instead of dropping rows, so the two counts match.
print("Original rows:", len(train_data))
print("Rows after capping:", len(train_capped))

In [None]:
# Clipped copy of the test frame. NOTE(review): test_capped is not
# referenced again in the visible part of the notebook.
test_capped = cap_outliers_iqr(test_data)

# Capping replaces values instead of dropping rows, so the two counts match.
print("Original rows:", len(test_data))
print("Rows after capping:", len(test_capped))

In [None]:
# Turn rating_count into floats by removing thousands separators (e.g. "1,234").
# Casting to str first keeps this cell re-runnable: once the column is already
# float, the bare `.str` accessor would raise AttributeError on a second run.
train_data['rating_count'] = (
    train_data['rating_count']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(float)
)

In [None]:
# Columns to screen with the 1.5 * IQR rule. 'latitude ' carries a trailing
# space because that is the column's name at this point in the notebook.
outlier_features = ['id', 'latitude ', 'longitude', 'rating', "rating_count"]
outlier_map = {}
for feature in outlier_features:
    column = train_data[feature]
    q1_val, q3_val = column.quantile([0.25, 0.75])
    whisker = 1.5 * (q3_val - q1_val)
    too_low = column < q1_val - whisker
    too_high = column > q3_val + whisker
    # Keep the offending rows per feature and report how many there are.
    detected_outliers = train_data[too_low | too_high]
    outlier_map[feature] = detected_outliers
    print(f"{feature}: {detected_outliers.shape[0]} outliers")

In [None]:
# IQR fences for rating_count, computed on the training data.
q1_rc = train_data['rating_count'].quantile(0.25)
q3_rc = train_data['rating_count'].quantile(0.75)
iqr_rc = q3_rc - q1_rc
low_rc = q1_rc - 1.5 * iqr_rc
up_rc = q3_rc + 1.5 * iqr_rc

# Pull values outside the fences back to the nearest fence.
train_data['rating_count'] = train_data['rating_count'].clip(low_rc, up_rc)
# Drop rows whose rating_count is exactly 0. The explicit .copy() makes the
# result an independent frame, so the later in-place edits (rename, adding and
# dropping columns) do not trigger SettingWithCopyWarning on a filtered slice.
train_data = train_data[train_data["rating_count"] != 0].copy()

In [None]:
# Re-run the 1.5 * IQR screen after rating_count was clipped; the feature list
# is the outlier_features defined in the earlier outlier-count cell.
outlier_map = {}
for feature in outlier_features:
    column = train_data[feature]
    q1_val, q3_val = column.quantile([0.25, 0.75])
    whisker = 1.5 * (q3_val - q1_val)
    too_low = column < q1_val - whisker
    too_high = column > q3_val + whisker
    detected_outliers = train_data[too_low | too_high]
    outlier_map[feature] = detected_outliers
    print(f"{feature}: {detected_outliers.shape[0]} outliers")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Box plot of rating_count on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=train_data['rating_count'], ax=ax)
ax.set_title('Boxplot of rating_count after handling outliers')
ax.set_xlabel('rating_count')
plt.show()

In [None]:
# Remove the trailing space from the 'latitude ' column name (in place) so that
# later cells can refer to the column as 'latitude'.
train_data.rename(columns={'latitude ': 'latitude'}, inplace=True)

## Present at least three visualizations and provide insights for the same

### 1. Distribution of Customer Ratings

In [None]:
# Histogram of the target with a KDE overlay, split into 5 bins.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(train_data['rating'], bins=5, kde=True, ax=ax)
ax.set_title('Distribution of Customer Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
plt.show()


### 2. Correlation Heatmap of Numerical Features

In [None]:
# Pairwise correlations between the location, popularity and target columns.
numerical_features = ['latitude', 'longitude', 'rating_count', 'rating']
correlation_matrix = train_data[numerical_features].corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="RdYlBu", ax=ax)
ax.set_title("Correlation Heatmap of Numerical Features")
plt.show()

### 3. Number of Reviews per Rating

In [None]:
# One bar per distinct rating value, showing how many rows carry it.
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(x='rating', data=train_data, ax=ax)
ax.set_title('Number of Reviews per Rating')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
plt.show()

### 4. Line graph for Average Rating vs Latitude

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bucket latitude by rounding it to the nearest integer. The helper column is
# added to train_data here and removed again by the following cell.
train_data['lat_bin'] = train_data['latitude'].round()

# Mean rating inside each latitude bucket, as a two-column frame.
lat_avg_rating = (
    train_data
    .groupby('lat_bin')['rating']
    .mean()
    .reset_index()
)

# Line chart of the per-bucket mean rating.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=lat_avg_rating, x='lat_bin', y='rating', marker='o', ax=ax)

ax.set_title("Average Rating vs Latitude")
ax.set_xlabel("Latitude (binned)")
ax.set_ylabel("Average Rating")
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()


In [None]:
# Remove the temporary latitude bucket column added for the line chart.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because 'lat_bin' is already gone.
train_data.drop(columns=['lat_bin'], inplace=True, errors='ignore')

## Scale Numerical features and Encode Categorical features

In [None]:
import random
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# Single seed shared by Python's and NumPy's global RNGs and by every estimator below.
RANDOM_STATE = 42
for seed_fn in (random.seed, np.random.seed):
    seed_fn(RANDOM_STATE)

# Strip stray whitespace from column names so train and test use the same labels.
for frame in (train_data, test_data):
    frame.columns = frame.columns.str.strip()

def split_date_parts(dataframe):
    """Replace ``review_time`` with ``review_year``/``review_month``/``review_day``.

    The ``review_time`` column is parsed with ``pd.to_datetime(errors='coerce')``,
    so unparseable values become NaT and yield NaN in all three derived columns.
    The input frame is left untouched; earlier this function wrote the parsed
    timestamps and the new columns into the caller's frame before returning a
    dropped copy, which also raised SettingWithCopyWarning on filtered frames.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing a ``review_time`` column.

    Returns
    -------
    pandas.DataFrame
        New frame without ``review_time`` and with the three date-part columns
        appended after the remaining columns.

    Raises
    ------
    KeyError
        If ``review_time`` is not a column of ``dataframe``.
    """
    timestamps = pd.to_datetime(dataframe['review_time'], errors='coerce')
    # drop() and assign() both return new frames, so the caller's data is never modified.
    return dataframe.drop(columns=['review_time']).assign(
        review_year=timestamps.dt.year,
        review_month=timestamps.dt.month,
        review_day=timestamps.dt.day,
    )

# Swap review_time for year/month/day columns in both splits.
train_data, test_data = map(split_date_parts, (train_data, test_data))

In [None]:
# Separate the label from the predictors.
target_feature = 'rating'
y_target = train_data[target_feature]
X_features = train_data.drop(columns=target_feature)

# Route columns to a preprocessing branch by dtype; the free-text 'review'
# column gets its own branch and is therefore kept out of the categorical list.
text_cols = ['review']
numeric_cols = list(X_features.select_dtypes(include=['number']).columns)
object_like_cols = X_features.select_dtypes(include=['object', 'category']).columns
categorical_cols = [name for name in object_like_cols if name not in text_cols]

In [None]:
def _flatten_text_column(values):
    """Collapse the imputer's output to 1-D via ravel() before it reaches the vectorizer."""
    return values.ravel()


# Numeric branch: fill gaps with the median, then standardise.
num_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])

# Categorical branch: fill gaps with the mode, then one-hot encode; categories
# unseen during fit are ignored at transform time.
cat_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Text branch: empty string for missing reviews, flatten, then TF-IDF over
# unigrams and bigrams limited to 5000 features.
text_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='constant', fill_value='')),
    ('flatten', FunctionTransformer(_flatten_text_column, validate=False)),
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
])

# Combine the three branches; columns not listed in any branch are not passed on.
data_preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, numeric_cols),
    ('cat', cat_transformer, categorical_cols),
    ('txt', text_transformer, text_cols),
])

## Model Building (at least 7)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier
# Every candidate is seeded identically so repeated runs are comparable.
seeded = {"random_state": RANDOM_STATE}

# Seven classifiers to compare under the same preprocessing and CV scheme.
model_candidates = {
    "MLP": MLPClassifier(
        hidden_layer_sizes=(128, 64),
        alpha=1e-4,
        learning_rate_init=0.001,
        max_iter=500,
        early_stopping=True,
        validation_fraction=0.1,
        **seeded,
    ),
    "LogReg": LogisticRegression(max_iter=1000, **seeded),
    "RF": RandomForestClassifier(n_estimators=200, **seeded),
    "GB": GradientBoostingClassifier(**seeded),
    "DT": DecisionTreeClassifier(**seeded),
    "LinSVC": LinearSVC(**seeded),
    "ET": ExtraTreesClassifier(n_estimators=100, **seeded),
}

In [None]:
from time import time as timer

# Stratified folds need at least n_splits samples in every class, so the fold
# count is capped by the rarest class (falling back to 2 when it has one row).
min_class_size = y_target.value_counts().min()
if min_class_size > 1:
    cv_splits = min(5, min_class_size)
else:
    cv_splits = 2
cv_strategy = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=RANDOM_STATE)

# Track the pipeline with the highest mean macro-F1 seen so far.
top_cv_score, chosen_model = -1, None

for mdl_name, mdl_obj in model_candidates.items():
    print(f"\n=== Training {mdl_name} ===")
    full_pipeline = Pipeline(steps=[
        ('prep', data_preprocessor),
        ('model', mdl_obj),
    ])
    scores = cross_val_score(
        full_pipeline, X_features, y_target,
        cv=cv_strategy, scoring='f1_macro', n_jobs=-1,
    )
    mean_score = scores.mean()
    print(f"{mdl_name} CV f1_macro: {mean_score:.4f} ± {scores.std():.4f}")
    if mean_score > top_cv_score:
        top_cv_score, chosen_model = mean_score, full_pipeline

## Hyperparameter Tuning on any 3 of the models

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

def _forest_space():
    """Search space shared by the two tree ensembles (a fresh dict per call)."""
    return {
        "model__n_estimators": randint(100, 500),
        "model__max_depth": [None, 10, 20, 30],
        "model__min_samples_split": randint(2, 10),
        "model__min_samples_leaf": randint(1, 5),
    }


# Parameter names are prefixed with "model__" to reach the pipeline's final step.
search_spaces = {
    "ET": _forest_space(),
    "RF": _forest_space(),
    "MLP": {
        "model__hidden_layer_sizes": [(128,), (128, 64), (256, 128)],
        # scipy's uniform(loc, scale) samples from [loc, loc + scale], so these
        # ranges are [1e-5, 1.01e-3] and [1e-4, 1.01e-2].
        # NOTE(review): confirm that (loc, scale) rather than (low, high) was intended.
        "model__alpha": uniform(1e-5, 1e-3),
        "model__learning_rate_init": uniform(0.0001, 0.01),
    },
}

# Fresh, untuned estimators to search over, in the order they are tuned.
tuning_targets = {
    "ET": ExtraTreesClassifier(random_state=RANDOM_STATE),
    "RF": RandomForestClassifier(random_state=RANDOM_STATE),
    "MLP": MLPClassifier(max_iter=500, early_stopping=True, random_state=RANDOM_STATE),
}

# Best refitted pipeline per model name.
# NOTE(review): best_model_results is not read again in the visible part of the
# notebook; the submission cell uses the untuned chosen_model instead.
best_model_results = {}
for search_name, base_model in tuning_targets.items():
    print(f"Tuning {search_name}...")
    search_pipeline = Pipeline(steps=[("prep", data_preprocessor), ("model", base_model)])
    rand_search = RandomizedSearchCV(
        search_pipeline,
        param_distributions=search_spaces[search_name],
        n_iter=10,
        scoring="f1_macro",
        cv=3,
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )
    rand_search.fit(X_features, y_target)
    best_model_results[search_name] = rand_search.best_estimator_
    print(f"Best params for {search_name}: {rand_search.best_params_}")
    print(f"Best CV score: {rand_search.best_score_:.4f}")

## Comparison of model performances

In [None]:
# Scores are hard-coded rather than read from the CV loop.
# NOTE(review): the CV cells in this notebook score with f1_macro, while this
# chart labels the values "Accuracy" - confirm which metric these numbers are.
model_results = {
    "MLP": 0.5997,
    "LogReg": 0.5672,
    "RF": 0.6113,
    "GB": 0.5224,
    "DT": 0.5575,
    "LinSVC": 0.5883,
    "ET": 0.6135,
}

# Best score first so the strongest model is the top bar.
results_df = (
    pd.DataFrame(list(model_results.items()), columns=['Model', 'Accuracy'])
    .sort_values(by='Accuracy', ascending=False)
)

fig, ax = plt.subplots(figsize=(12, 6))
bars = sns.barplot(x='Accuracy', y='Model', data=results_df, palette='viridis', ax=ax)

# Write each score just to the right of its bar.
for row_pos, score in enumerate(results_df['Accuracy']):
    ax.text(score + 0.002, row_pos, f'{score:.4f}', va='center')

ax.set_title('Model Accuracy Comparison')
ax.set_xlabel('Validation Accuracy')
ax.set_ylabel('Model')
plt.show()

In [None]:
# Refit the best cross-validated pipeline on the full training set.
print(f"\nBest model: {chosen_model.named_steps['model'].__class__.__name__} ({top_cv_score:.4f})")
chosen_model.fit(X_features, y_target)

# Give the test rating_count the same treatment the training column received:
# strip thousands separators, convert to float, and clip to the train-derived
# IQR fences (low_rc / up_rc from the rating_count clipping cell).
# Only this column is touched. Stripping commas from every column also rewrote
# the review text and the categorical values, so they no longer matched what
# the encoder and vectorizer saw during fit; the blanket
# to_numeric(errors='ignore') is deprecated and could silently leave
# rating_count as strings.
test_data['rating_count'] = pd.to_numeric(
    test_data['rating_count'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',  # unparseable entries become NaN and are median-imputed by the pipeline
).clip(low_rc, up_rc)

final_preds = chosen_model.predict(test_data)

# Write predictions into the sample submission layout; rows are assumed to be
# in the same order as test.csv.
submission_template = pd.read_csv("/kaggle/input/mlp-term-2-2025-kaggle-assignment-3/sample_submission.csv")
final_submission = submission_template.copy()
final_submission[target_feature] = final_preds
final_submission.to_csv("submission.csv", index=False)
print("✅ submission.csv saved!")