In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm
import ydata_profiling as ydata

In [None]:
# Load the training dataset (path is relative to the notebook's working directory)
df = pd.read_csv("../AI Talents League - Round 1/train.csv")

### 1. Exploratory Data Analysis (EDA) ###
# Print the (rows, columns) shape and render the first five rows
print("Dataset Shape:", df.shape)
display(df.head())

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
df.describe()

In [None]:
# Generate a ydata-profiling report for the raw training frame and write it to an HTML file
ydata.ProfileReport(df).to_file("profile_report.html")

In [None]:
# Number of rows that are exact duplicates of an earlier row
df.duplicated().sum()

In [None]:
# Share of missing entries in every column, expressed as a percentage of all rows
null_counts = df.isnull().sum()
print("Missing Values Per Column %:")
print(null_counts * 100 / len(df.index))

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame
df.info()

In [None]:
# Separate the target from the predictors
y = df["Y"]
# "X1" is removed together with the target because it is not used as a feature
X = df.drop(["Y", "X1"], axis=1)

In [None]:
# Frequency of each category in X9 (missing values are not counted by default)
X["X9"].value_counts()

In [None]:
# Impute the two columns that contain gaps:
#   X2 (numeric)     -> column mean
#   X9 (categorical) -> explicit "Missing" level
x2_mean = X["X2"].mean()
X["X2"] = X["X2"].fillna(x2_mean)
X["X9"] = X["X9"].fillna("Missing")

In [None]:
# Split the column names by dtype: numeric columns vs. everything else
num_features = list(X.select_dtypes(include="number").columns)
cat_features = list(X.select_dtypes(exclude="number").columns)

In [None]:
# Correlation heatmap of the numerical features
plt.figure(figsize=(12, 8))
# Pairwise Pearson correlations between the numeric feature columns
corr_matrix = X[num_features].corr()
# Plot the matrix itself. Calling .corr() on it a second time would show how
# similar the correlation *profiles* are, not how correlated the features are.
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Feature Correlation Matrix")
plt.show()

In [None]:
# Histogram (30 bins) of the target with a kernel-density overlay
target_ax = sns.histplot(data=y, bins=30, kde=True)
target_ax.set_title("Target Variable Distribution")
plt.show()

In [None]:
# Detect outliers visually: one boxplot per numerical feature
for col in num_features:
    # Create the figure inside the loop: plt.show() closes the current figure,
    # so a single figure created before the loop would size only the first plot.
    plt.figure(figsize=(12, 6))
    sns.boxplot(data=df[[col]], x=col)
    plt.xticks(rotation=90)
    plt.title(f"Boxplot for Outlier Detection of {col}")
    plt.show()

In [None]:
# Count outliers per numerical feature with the 1.5 * IQR rule:
# a value is flagged when it lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
outliers = {}
for feature in num_features:
    series = df[feature]
    q_low, q_high = series.quantile(0.25), series.quantile(0.75)
    spread = q_high - q_low

    fence_lo = q_low - 1.5 * spread
    fence_hi = q_high + 1.5 * spread

    # Explicit comparisons (rather than a negated range test) so NaN rows are never flagged
    flagged = (series < fence_lo) | (series > fence_hi)
    flagged_count = flagged.sum()

    outliers[feature] = {
        'num_outliers': flagged_count,
        'percentage_outliers': (flagged_count / len(df)) * 100,
    }

# Alias kept for the reporting loop below
outlier_info = outliers

# Report the count and share of flagged rows for every feature
for feature, summary in outlier_info.items():
    print(f"Feature: {feature}")
    print(f"Number of Outliers: {summary['num_outliers']}")
    print(f"Percentage of Outliers: {summary['percentage_outliers']:.2f}%")
    print("-" * 40)

In [None]:
# Pairplot for Feature Relationships
# Pairwise scatter plots over the raw training frame, with KDE curves on the diagonal
sns.pairplot(df, diag_kind="kde")
plt.show()

In [None]:
# Binning Feature "X8" into categories
# pd.cut uses right-closed intervals by default, so the bins are
# (1980, 1990] -> "Very Old", (1990, 2000] -> "Old", (2000, 2010] -> "Recent".
# Values outside (1980, 2010] become NaN.
# `bins` and `labels` are reused later to bin the test set the same way.
bins = [1980, 1990, 2000, 2010]
labels = ["Very Old", "Old", "Recent"]
X["X8"] = pd.cut(df["X8"], bins=bins, labels=labels)

In [None]:
# Yeo-Johnson power transformer; it is fitted on feature "X4" in the following cell
transformer = PowerTransformer(method='yeo-johnson')
# Disabled experiment: applying the same transform to the target
# (refers to a `target` variable that is not defined in this notebook)
#y=transformer.fit_transform(target.values.reshape(-1,1))
#y = target.ravel()

In [None]:
# Fit the power transform on X4 and overwrite the column with the transformed values.
# The transformer expects a 2-D array, hence the reshape to a single column.
x4_column = X["X4"].to_numpy().reshape(-1, 1)
X["X4"] = transformer.fit_transform(x4_column).ravel()

In [None]:
# Recompute the dtype-based column lists: X8 is categorical after binning,
# so it no longer counts as a numerical feature.
num_features = X.select_dtypes(include=[np.number]).columns.tolist()
cat_features = [column for column in X.columns if column not in num_features]


In [None]:
# Standardise the numerical features (zero mean, unit variance).
# The fitted scaler is kept so the same statistics can be applied elsewhere.
scaler = StandardScaler().fit(X[num_features])
X[num_features] = scaler.transform(X[num_features])

In [None]:
# Collapse the different spellings found in X3 into two canonical labels.
# Keys are the lower-cased, whitespace-trimmed spellings.
x3_aliases = {
    'low fat': 'Low Fat',
    'lf': 'Low Fat',
    'regular': 'Regular',
    'reg': 'Regular',
}

# Normalise case and surrounding whitespace first, then map to the canonical label;
# spellings not listed above are left as they are.
X["X3"] = X["X3"].str.lower().str.strip().replace(x3_aliases)

In [None]:
# Encoding Categorical Features
# Explicit level order for each ordered categorical column (lowest code first)
category_orders = {
    "X9": ["Missing", "Small", "Medium", "High"],
    "X8": ["Very Old", "Old", "Recent"],
}
ordinal_encoders = {
    feature: OrdinalEncoder(categories=[order])
    for feature, order in category_orders.items()
}

# Replace each column with its integer-coded version (0, 1, 2, ... in the order above)
for feature, fitted_encoder in ordinal_encoders.items():
    X[feature] = fitted_encoder.fit_transform(X[[feature]]).ravel()

In [None]:
# One-hot encode the remaining (unordered) categorical columns
nominal_cols = ["X3", "X5", "X7", "X10", "X11"]

# handle_unknown='ignore' encodes categories unseen during fit as all-zero rows
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
one_hot_features = one_hot_encoder.fit_transform(X[nominal_cols])
one_hot_df = pd.DataFrame(one_hot_features, columns=one_hot_encoder.get_feature_names_out())

# Swap the original columns for their indicator columns; both frames get a fresh
# 0..n-1 index so the column-wise concat lines rows up by position.
X = pd.concat(
    [
        X.drop(columns=nominal_cols).reset_index(drop=True),
        one_hot_df.reset_index(drop=True),
    ],
    axis=1,
)


In [None]:
# Display the fully preprocessed feature matrix
X

In [None]:
# Train-Test Split
# Hold out 20% of the rows; random_state fixes the shuffle so the split is reproducible.
# NOTE(review): the imputation, power transform, scaling and encoders above were fitted
# on all rows of X before this split, so the held-out rows influenced those statistics.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Hyper-parameter grids, keyed by the display name of each model.
# Each value is a GridSearchCV-style mapping of parameter name -> candidate values.
param_grids = {
    # No tunable hyper-parameters: a single fit with the defaults
    'Linear Regression': {},
    'Ridge Regression': {
        'alpha': [0.1, 1, 10, 100],
        'solver': ['auto', 'saga'],
    },
    'Lasso Regression': {
        'alpha': [0.01, 0.1, 0.5, 1, 10],
        'max_iter': [1000, 2000, 3000],
    },
    'Random Forest': {
        'n_estimators': [100, 200, 500],
        'max_depth': [None, 10, 20, 50],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2'],
    },
    'Gradient Boosting': {
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'n_estimators': [100, 200, 500],
        'max_depth': [3, 5, 10],
        'subsample': [0.7, 0.8, 0.9],
    },
    'XGBoost': {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.05, 0.1],
        'subsample': [0.7, 0.8],
        # colsample_bytree must lie in (0, 1]; the previous second candidate was 0.0,
        # which XGBoost rejects. NOTE(review): 0.7 is assumed to be the intended
        # value of the truncated literal - adjust if another fraction was meant.
        'colsample_bytree': [0.3, 0.7],
        'gamma': [0, 0.1],
        'reg_alpha': [0, 0.1],
        'reg_lambda': [1, 5],
        'min_child_weight': [1, 3],
    },

    'Support Vector Regression': {
        'C': [0.1, 1, 10, 100],
        'kernel': ['rbf'],
        'gamma': ['scale', 'auto', 0.001, 0.01, 0.1],
        'epsilon': [0.01, 0.05, 0.1, 0.2],
    },
}

In [None]:
# Candidate models, keyed by the same display names used in `param_grids`.
# The tree ensembles are stochastic, so they are seeded (same seed as the
# train/test split) to make the grid-search ranking reproducible across runs.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    # XGBoost is currently excluded from the search
    #'XGBoost':XGBRegressor(tree_method='hist'),
    'Support Vector Regression': SVR()
}

In [None]:
# Cross-validated MAE and refitted best estimator of every model, keyed by model name
best_results, best_models = {}, {}

# Tune each candidate with a 5-fold grid search over its parameter grid
for name, model in tqdm(models.items(), desc="Training Models"):
    print(f"Running GridSearchCV for {name}...")
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        scoring='neg_mean_absolute_error',
        cv=5,
        n_jobs=3,
    )
    grid_search.fit(X_train, y_train)
    # scikit-learn maximises scores, so MAE is reported negated; flip the sign back
    best_results[name] = -grid_search.best_score_
    best_models[name] = grid_search.best_estimator_

# Summary of the best cross-validated MAE per model
for name, mae in best_results.items():
    print(f"{name}: Best MAE = {mae}")

# The winner is the model with the lowest cross-validated MAE
best_model_name = min(best_results, key=lambda model_name: best_results[model_name])
best_model = best_models[best_model_name]
print(f"Best Model: {best_model_name} with MAE of {best_results[best_model_name]}")


In [None]:
# Compare the held-out targets with the best model's predictions for the same rows.
# (The predictions must come from X_test; y_train has a different length and index,
# so pairing it with y_test produced a misaligned frame full of NaNs.)
df1 = pd.DataFrame({'Actual': y_test, 'Predicted': best_model.predict(X_test)})
# Bar chart of the first ten held-out rows
df2 = df1.head(10)
df2.plot(kind = 'bar')

In [None]:
# Load the competition test set (path is relative to the notebook's working directory)
test_df=pd.read_csv("../AI Talents League - Round 1/test.csv")
# Preview the first five rows
test_df.head()

In [None]:
# Remove "X1", mirroring the column dropped from the training features
test_df = test_df.drop(columns=["X1"])
# Inspect the raw spellings present in X3 before normalising them
test_df["X3"].unique()


In [None]:
# Collapse the X3 spellings in the test set into 'Low Fat' / 'Regular'
# Step 1: lower-case and trim surrounding whitespace
normalized_x3 = test_df["X3"].str.lower().str.strip()

# Step 2: map every known spelling to its canonical label (others stay unchanged)
test_df["X3"] = normalized_x3.replace({
    'low fat': 'Low Fat',
    'lf': 'Low Fat',
    'regular': 'Regular',
    'reg': 'Regular'
})


In [None]:
# Fraction of missing values per column in the test set.
# The denominator is the test set's own row count, not the training frame's.
test_df.isnull().sum()/len(test_df)


In [None]:
# Impute the test set with the same rules as training:
#   X2 -> mean of the raw *training* column (df still holds the un-imputed, unscaled
#         values, so this is the same constant used to fill the training features)
#   X9 -> explicit "Missing" level
test_df = test_df.fillna({"X2": df["X2"].mean(), "X9": "Missing"})
# Bin X8 with the bin edges and labels defined for the training data
test_df["X8"] = pd.cut(test_df["X8"], bins=bins, labels=labels)

In [None]:
# Apply the preprocessing that was FITTED ON THE TRAINING DATA to the test set.
# Re-fitting the transformer / scaler / encoder here would scale and encode the test
# rows with different statistics and could yield a different set of one-hot columns
# than the model was trained on.
numerical_features = test_df.select_dtypes(include=[np.number]).columns.tolist()
catergical_features = test_df.select_dtypes(exclude=[np.number]).columns.tolist()

# Power-transform X4 with the lambda learned on the training column
test_df["X4"] = transformer.transform(test_df["X4"].values.reshape(-1, 1)).ravel()

# Standardise with the training means / variances; `num_features` is the exact
# column list (and order) the scaler was fitted on.
test_df[num_features] = scaler.transform(test_df[num_features])

# Ordinal encoders use fixed, explicit category orders, so fitting them here yields
# the same value -> code mapping as in training.
ordinal_enconder_X9 = OrdinalEncoder(categories=[['Missing', 'Small', 'Medium', 'High']])
test_df["X9"] = ordinal_enconder_X9.fit_transform(test_df[["X9"]]).ravel()
ordinal_enconder_X8 = OrdinalEncoder(categories=[['Very Old', 'Old', 'Recent']])
test_df["X8"] = ordinal_enconder_X8.fit_transform(test_df[["X8"]]).ravel()

# One-hot encode with the encoder fitted on the training features; categories unseen
# in training become all-zero rows because of handle_unknown='ignore'.
categorical_cols = ['X3', 'X5', 'X7', 'X10', 'X11']
one_hot_features = one_hot_encoder.transform(test_df[categorical_cols])
one_hot_df = pd.DataFrame(
    one_hot_features,
    columns=one_hot_encoder.get_feature_names_out(categorical_cols),
    index=test_df.index,  # keep row alignment with test_df for the concat below
)

# Replace the original categorical columns with their indicator columns
test_df = pd.concat([test_df.drop(columns=categorical_cols), one_hot_df], axis=1)

# Enforce the training column set and order; raises KeyError if a column is missing
test_df = test_df[X.columns]
test_df

In [None]:
# Predict the target for the preprocessed test rows with the selected model
y_pred = best_model.predict(test_df)

# Two-column submission: the test frame's row index and the predicted target
submission = pd.DataFrame({'row_id': test_df.index, 'Y': y_pred})
submission.to_csv('../AI Talents League - Round 1/predictionsTXGBOOST.csv', index=False)
