<a href="https://colab.research.google.com/github/SatoruGojo9/TDS-ASSAIGNMENT/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# -*- coding: utf-8 -*-
"""BDM Assignment - 02
Linear Regression for Credit Limit Prediction
"""

from google.colab import files
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Upload the dataset
# files.upload() returns a mapping whose keys are the uploaded file names;
# only the first uploaded file is used.
print("Please upload your dataset file:")
uploaded = files.upload()
if not uploaded:
    # Nothing was selected: fail with a clear message instead of the bare
    # IndexError that indexing an empty key list would produce.
    raise ValueError("No file was uploaded. Please re-run this cell and upload a CSV or Excel file.")
file_name = list(uploaded.keys())[0]

# Read the dataset
# The reader is picked from the file extension, compared case-insensitively so
# that names such as 'DATA.CSV' are accepted too.
if file_name.lower().endswith('.csv'):
    df = pd.read_csv(file_name)
elif file_name.lower().endswith(('.xlsx', '.xls')):
    df = pd.read_excel(file_name)
else:
    # Stop here. Only printing a message would leave `df` undefined on a fresh
    # kernel (NameError on the next statement) or, on a re-run, would silently
    # keep analysing a stale `df` from a previous upload.
    raise ValueError(
        f"Unsupported file format: {file_name!r}. Please upload CSV or Excel file."
    )

# Overview of the loaded frame: dimensions, column names, a preview of the
# leading rows, the dtype of every column and the per-column count of nulls.
print("", "=" * 60, "Dataset Information", "=" * 60, sep="\n")
print(f"Shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print("\nFirst few rows:", df.head(), sep="\n")
print("\nData types:", df.dtypes, sep="\n")
print("\nMissing values:", df.isna().sum(), sep="\n")

# ----------------------------------------------------------------------------
# Data Preparation
# ----------------------------------------------------------------------------
print("", "=" * 60, "Data Preparation", "=" * 60, sep="\n")

# Positional split of the frame: every column except the last one becomes the
# feature matrix X, and the last column becomes the target y.
# NOTE(review): this presumes the target ('Credit_Limit') is the final column
# of the uploaded file -- confirm before using a different dataset.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

print(
    f"Feature matrix (X) shape: {X.shape}",
    f"Target variable (y) shape: {y.shape}",
    f"Target variable name: {df.columns[-1]}",
    sep="\n",
)

# Hold out 30% of the rows for testing; random_state=42 is passed so the same
# seed is used on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

print(
    f"\nTrain set size: {X_train.shape[0]} samples",
    f"Test set size: {X_test.shape[0]} samples",
    sep="\n",
)

# ----------------------------------------------------------------------------
# Question 1: fit a linear regression and report its R² on the test split
# ----------------------------------------------------------------------------
print("", "=" * 60, "Question 1: R² Score on Test Dataset", "=" * 60, sep="\n")

# The model is built with fit_intercept=False. fit() returns the estimator
# itself, so construction and training are chained.
lr_model = LinearRegression(fit_intercept=False).fit(X_train, y_train)

# Predict on the held-out rows and score those predictions.
y_pred = lr_model.predict(X_test)
r2_test = r2_score(y_test, y_pred)

print(
    f"R² Score on Test Dataset: {r2_test:.3f}",
    f"Answer for Question 1: {r2_test:.3f}",
    sep="\n",
)

# ----------------------------------------------------------------------------
# Question 2: which feature carries the largest coefficient magnitude?
# ----------------------------------------------------------------------------
print("", "=" * 60, "Question 2: Feature with Highest Absolute Coefficient", "=" * 60, sep="\n")

# Coefficients of the fitted linear model, their magnitudes, and the position
# of the largest magnitude.
coefficients = lr_model.coef_
abs_coefficients = np.abs(coefficients)
max_coef_index = abs_coefficients.argmax()

# One line per feature: position, column name, signed value and magnitude.
print("\nAll coefficients:")
for idx, (col, coef) in enumerate(zip(X.columns, coefficients)):
    print(f"  Index {idx} ({col}): {coef:.6f} (abs: {abs(coef):.6f})")

print(
    "\nFeature with highest absolute coefficient:",
    f"  Index: {max_coef_index}",
    f"  Feature name: {X.columns[max_coef_index]}",
    f"  Coefficient value: {coefficients[max_coef_index]:.6f}",
    f"  Absolute value: {abs_coefficients[max_coef_index]:.6f}",
    sep="\n",
)

print(f"\nAnswer for Question 2: {max_coef_index}")

# ----------------------------------------------------------------------------
# Summary of Questions 1 and 2
# ----------------------------------------------------------------------------
print(
    "",
    "=" * 60,
    "FINAL ANSWERS",
    "=" * 60,
    f"Question 1 - R² Score (3 decimal places): {r2_test:.3f}",
    f"Question 2 - Feature Index with Highest |Coefficient|: {max_coef_index}",
    "=" * 60,
    sep="\n",
)

Please upload your dataset file:


Saving GA_5_dataset.csv to GA_5_dataset.csv

Dataset Information
Shape: (6000, 17)

Columns: ['Marital_Status_Married', 'Marital_Status_Single', 'Attrition_Flag_Existing Customer', 'Gender_M', 'Education_Level', 'Income_Category', 'Card_Category', 'Customer_Age', 'Dependent_count', 'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Total_Revolving_Bal', 'Total_Trans_Amt', 'Total_Trans_Ct', 'Credit_Limit']

First few rows:
   Marital_Status_Married  Marital_Status_Single  \
0                1.069045              -0.921798   
1                1.069045              -0.921798   
2               -0.935414               1.084837   
3               -0.935414               1.084837   
4               -0.935414               1.084837   

   Attrition_Flag_Existing Customer  Gender_M  Education_Level  \
0                          0.439814 -0.951190         0.834712   
1                          0.439814 -0.951190        -0.499271   
2               

In [2]:
# ============================================================================
# Questions 3 & 4: Ridge Regression Model
# ============================================================================
from sklearn.linear_model import Ridge

print("", "=" * 60, "Question 3 & 4: Ridge Regression Model", "=" * 60, sep="\n")

# Build the Ridge model with the requested solver, tolerance and seed, and
# train it on the training split (fit() returns the estimator itself).
ridge_model = Ridge(solver='sag', tol=0.0005, random_state=42).fit(X_train, y_train)

# Test-set predictions, their R², and the fitted intercept.
y_pred_ridge = ridge_model.predict(X_test)
r2_test_ridge = r2_score(y_test, y_pred_ridge)
intercept_value = ridge_model.intercept_

print(
    "\nModel Training Complete!",
    "Parameters used:",
    "  - solver: 'sag'",
    "  - tol: 0.0005",
    "  - random_state: 42",
    sep="\n",
)

# ----------------------------------------------------------------------------
# Question 3: R² of the Ridge model on the test split
# ----------------------------------------------------------------------------
print("", "-" * 60, "Question 3: R² Score on Test Dataset", "-" * 60, sep="\n")
print(
    f"R² Score on Test Dataset: {r2_test_ridge:.3f}",
    f"\nAnswer for Question 3: {r2_test_ridge:.3f}",
    sep="\n",
)

# ----------------------------------------------------------------------------
# Question 4: fitted intercept of the Ridge model
# ----------------------------------------------------------------------------
print("", "-" * 60, "Question 4: Intercept Value", "-" * 60, sep="\n")
print(
    f"Intercept (intercept_): {intercept_value:.3f}",
    f"\nAnswer for Question 4: {intercept_value:.3f}",
    sep="\n",
)

# ----------------------------------------------------------------------------
# Extra: one line per feature with its Ridge coefficient
# ----------------------------------------------------------------------------
print("", "-" * 60, "Ridge Model Coefficients:", "-" * 60, sep="\n")
for idx, (col, coef) in enumerate(zip(X.columns, ridge_model.coef_)):
    print(f"  Index {idx} ({col}): {coef:.6f}")

# ----------------------------------------------------------------------------
# Summary of Questions 3 and 4
# ----------------------------------------------------------------------------
print(
    "",
    "=" * 60,
    "FINAL ANSWERS FOR QUESTIONS 3 & 4",
    "=" * 60,
    f"Question 3 - R² Score (3 decimal places): {r2_test_ridge:.3f}",
    f"Question 4 - Intercept (3 decimal places): {intercept_value:.3f}",
    "=" * 60,
    sep="\n",
)


Question 3 & 4: Ridge Regression Model

Model Training Complete!
Parameters used:
  - solver: 'sag'
  - tol: 0.0005
  - random_state: 42

------------------------------------------------------------
Question 3: R² Score on Test Dataset
------------------------------------------------------------
R² Score on Test Dataset: 0.503

Answer for Question 3: 0.503

------------------------------------------------------------
Question 4: Intercept Value
------------------------------------------------------------
Intercept (intercept_): 8638.308

Answer for Question 4: 8638.308

------------------------------------------------------------
Ridge Model Coefficients:
------------------------------------------------------------
  Index 0 (Marital_Status_Married): -800.313930
  Index 1 (Marital_Status_Single): -510.178008
  Index 2 (Attrition_Flag_Existing Customer): 265.341929
  Index 3 (Gender_M): -236.792194
  Index 4 (Education_Level): -31.522020
  Index 5 (Income_Category): 4498.588425
  Index

In [3]:
# ============================================================================
# Questions 5 & 6: Lasso Regression Model
# ============================================================================
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

print("", "=" * 60, "Questions 5 & 6: Lasso Regression Model", "=" * 60, sep="\n")

# Build the Lasso model with alpha=100 and a fixed seed, and train it on the
# training split (fit() returns the estimator itself).
lasso_model = Lasso(alpha=100, random_state=42).fit(X_train, y_train)

# Test-set predictions and their R².
y_pred_lasso = lasso_model.predict(X_test)
r2_test_lasso = r2_score(y_test, y_pred_lasso)

print(
    "\nModel Training Complete!",
    "Parameters used:",
    "  - alpha: 100",
    "  - random_state: 42",
    sep="\n",
)

# ----------------------------------------------------------------------------
# Question 5: R² of the Lasso model on the test split
# ----------------------------------------------------------------------------
print("", "-" * 60, "Question 5: R² Score on Test Dataset", "-" * 60, sep="\n")
print(
    f"R² Score on Test Dataset: {r2_test_lasso:.3f}",
    f"\nAnswer for Question 5: {r2_test_lasso:.3f}",
    sep="\n",
)

# ----------------------------------------------------------------------------
# Question 6: how many Lasso coefficients lie inside [-1, 1]?
# ----------------------------------------------------------------------------
print("", "-" * 60, "Question 6: Coefficients in Range [-1, 1]", "-" * 60, sep="\n")

lasso_coefficients = lasso_model.coef_

# |c| <= 1 is the same condition as -1 <= c <= 1; summing the boolean mask
# counts the coefficients that satisfy it.
coef_in_range = np.sum(np.abs(lasso_coefficients) <= 1)

# One line per feature, flagged with a tick when the value is inside the range.
print("\nLasso Model Coefficients:")
for idx, (col, coef) in enumerate(zip(X.columns, lasso_coefficients)):
    in_range = "✓" if abs(coef) <= 1 else "✗"
    print(f"  Index {idx} ({col}): {coef:.6f} [{in_range}]")

print(
    f"\nNumber of coefficients in range [-1, 1]: {coef_in_range}",
    f"\nAnswer for Question 6: {coef_in_range}",
    sep="\n",
)

# ----------------------------------------------------------------------------
# Question 7: k-nearest-neighbours regression and its test RMSE
# ----------------------------------------------------------------------------
print("", "=" * 60, "Question 7: KNeighborsRegressor Model", "=" * 60, sep="\n")

# Build the regressor with n_neighbors=10 and p=1, and train it on the
# training split (fit() returns the estimator itself).
knn_model = KNeighborsRegressor(n_neighbors=10, p=1).fit(X_train, y_train)

# Test-set predictions; RMSE is the square root of the mean squared error.
y_pred_knn = knn_model.predict(X_test)
mse_knn = mean_squared_error(y_test, y_pred_knn)
rmse_knn = np.sqrt(mse_knn)

print(
    "\nModel Training Complete!",
    "Parameters used:",
    "  - n_neighbors: 10",
    "  - p: 1 (Manhattan distance)",
    sep="\n",
)

print("", "-" * 60, "Question 7: RMSE on Test Dataset", "-" * 60, sep="\n")
print(
    f"Mean Squared Error (MSE): {mse_knn:.3f}",
    f"Root Mean Squared Error (RMSE): {rmse_knn:.3f}",
    f"\nAnswer for Question 7: {rmse_knn:.3f}",
    sep="\n",
)

# ============================================================================
# Summary of All Three Questions
# ============================================================================
print("\n" + "="*60)
print("FINAL ANSWERS FOR QUESTIONS 5, 6 & 7")
print("="*60)
print(f"Question 5 - Lasso R² Score (3 decimal places): {r2_test_lasso:.3f}")
print(f"Question 6 - Coefficients in range [-1, 1]: {coef_in_range}")
print(f"Question 7 - KNN RMSE (3 decimal places): {rmse_knn:.3f}")
print("="*60)

# ============================================================================
# Additional Comparison (Optional)
# ============================================================================
# Every figure is read from the variable that holds the computed score rather
# than from a pasted literal, so the table cannot go stale when the dataset or
# a model changes. `r2_test` is set by the Question 1 code and `r2_test_ridge`
# by the Question 3 code, so those cells must have been run before this one.
print("\n" + "="*60)
print("Model Performance Comparison")
print("="*60)
print(f"Linear Regression R²: {r2_test:.3f} (from Q1)")
print(f"Ridge Regression R²:   {r2_test_ridge:.3f} (from Q3)")
print(f"Lasso Regression R²:   {r2_test_lasso:.3f} (from Q5)")
print(f"KNN RMSE:              {rmse_knn:.3f} (from Q7)")
print("="*60)


Questions 5 & 6: Lasso Regression Model

Model Training Complete!
Parameters used:
  - alpha: 100
  - random_state: 42

------------------------------------------------------------
Question 5: R² Score on Test Dataset
------------------------------------------------------------
R² Score on Test Dataset: 0.501

Answer for Question 5: 0.501

------------------------------------------------------------
Question 6: Coefficients in Range [-1, 1]
------------------------------------------------------------

Lasso Model Coefficients:
  Index 0 (Marital_Status_Married): -197.631340 [✗]
  Index 1 (Marital_Status_Single): -0.000000 [✓]
  Index 2 (Attrition_Flag_Existing Customer): 0.000000 [✓]
  Index 3 (Gender_M): 0.000000 [✓]
  Index 4 (Education_Level): -0.000000 [✓]
  Index 5 (Income_Category): 4289.971423 [✗]
  Index 6 (Card_Category): 3753.346678 [✗]
  Index 7 (Customer_Age): 0.000000 [✓]
  Index 8 (Dependent_count): 62.846822 [✗]
  Index 9 (Months_on_book): 0.000000 [✓]
  Index 10 (Total

In [4]:
# ============================================================================
# Question 8: Decision Tree Regressor
# ============================================================================
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV

print("", "=" * 60, "Question 8: Decision Tree Regressor", "=" * 60, sep="\n")

# Build the tree with the requested depth / split / leaf limits and seed, and
# train it on the training split (fit() returns the estimator itself).
dt_model = DecisionTreeRegressor(
    max_depth=10, min_samples_split=6, min_samples_leaf=6, random_state=42
).fit(X_train, y_train)

# Test-set predictions; RMSE is the square root of the mean squared error.
y_pred_dt = dt_model.predict(X_test)
mse_dt = mean_squared_error(y_test, y_pred_dt)
rmse_dt = np.sqrt(mse_dt)

print(
    "\nModel Training Complete!",
    "Parameters used:",
    "  - max_depth: 10",
    "  - min_samples_split: 6",
    "  - min_samples_leaf: 6",
    "  - random_state: 42",
    sep="\n",
)

print("", "-" * 60, "Question 8: RMSE on Test Dataset", "-" * 60, sep="\n")
print(
    f"Mean Squared Error (MSE): {mse_dt:.3f}",
    f"Root Mean Squared Error (RMSE): {rmse_dt:.3f}",
    f"\nAnswer for Question 8: {rmse_dt:.3f}",
    sep="\n",
)

# ----------------------------------------------------------------------------
# Questions 9 & 10: tune AdaBoost with a cross-validated grid search
# ----------------------------------------------------------------------------
print("", "=" * 60, "Questions 9 & 10: AdaBoost with GridSearchCV", "=" * 60, sep="\n")

# Candidate values for the two hyperparameters being searched.
param_grid = {
    'n_estimators': [10, 50, 100, 200, 500],
    'learning_rate': [0.1, 0.5, 1, 2],
}

print(
    "\nHyperparameter Grid:",
    f"  - n_estimators: {param_grid['n_estimators']}",
    f"  - learning_rate: {param_grid['learning_rate']}",
    f"  - Total combinations: {len(param_grid['n_estimators']) * len(param_grid['learning_rate'])}",
    sep="\n",
)

# Base estimator whose hyperparameters are searched.
ada_model = AdaBoostRegressor(random_state=42)

# These messages are printed before fit() so they precede the progress output
# requested with verbose=1.
print("\nPerforming GridSearchCV with cv=4...", "This may take a few moments...", sep="\n")

grid_search = GridSearchCV(
    estimator=ada_model, param_grid=param_grid, cv=4, scoring='r2', n_jobs=-1, verbose=1
)
grid_search.fit(X_train, y_train)

print("\nGridSearchCV Complete!")

# Best estimator and the hyperparameter values it was built with.
best_ada_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_n_estimators, best_learning_rate = best_params['n_estimators'], best_params['learning_rate']

print("", "-" * 60, "Best Parameters Found:", "-" * 60, sep="\n")
print(
    f"  - n_estimators: {best_n_estimators}",
    f"  - learning_rate: {best_learning_rate}",
    f"  - Best CV Score (R²): {grid_search.best_score_:.4f}",
    sep="\n",
)

# Score the best estimator on the held-out test split.
y_pred_ada = best_ada_model.predict(X_test)
r2_test_ada = r2_score(y_test, y_pred_ada)

# ----------------------------------------------------------------------------
# Question 9: R² of the best AdaBoost model on the test split
# ----------------------------------------------------------------------------
print("", "-" * 60, "Question 9: R² Score on Test Dataset", "-" * 60, sep="\n")
print(
    f"R² Score on Test Dataset: {r2_test_ada:.3f}",
    f"\nAnswer for Question 9: {r2_test_ada:.3f}",
    sep="\n",
)

# ----------------------------------------------------------------------------
# Question 10: n_estimators chosen by the grid search
# ----------------------------------------------------------------------------
print("", "-" * 60, "Question 10: Best n_estimators Value", "-" * 60, sep="\n")
print(
    f"Best n_estimators: {best_n_estimators}",
    f"\nAnswer for Question 10: {best_n_estimators}",
    sep="\n",
)

# ----------------------------------------------------------------------------
# Extra: the five best-ranked parameter combinations
# ----------------------------------------------------------------------------
print("", "-" * 60, "Top 5 Parameter Combinations (by CV Score):", "-" * 60, sep="\n")

# Tabulate the search results and order them by rank (ascending sort, so
# rank 1 comes first).
results_df = pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score')

for idx, row in results_df.head(5).iterrows():
    print(
        f"  Rank {int(row['rank_test_score'])}: "
        f"n_estimators={row['param_n_estimators']}, "
        f"learning_rate={row['param_learning_rate']}, "
        f"CV Score={row['mean_test_score']:.4f}"
    )

# ----------------------------------------------------------------------------
# Summary of Questions 8, 9 and 10
# ----------------------------------------------------------------------------
print(
    "",
    "=" * 60,
    "FINAL ANSWERS FOR QUESTIONS 8, 9 & 10",
    "=" * 60,
    f"Question 8  - Decision Tree RMSE (3 decimal places): {rmse_dt:.3f}",
    f"Question 9  - AdaBoost R² Score (3 decimal places): {r2_test_ada:.3f}",
    f"Question 10 - Best n_estimators: {best_n_estimators}",
    "=" * 60,
    sep="\n",
)

# ============================================================================
# Complete Model Performance Summary
# ============================================================================
# Every figure is read from the variable that holds the computed metric rather
# than from a pasted literal, so the table always matches the models that were
# actually trained in this session. `r2_test`, `r2_test_ridge`, `r2_test_lasso`
# and `rmse_knn` are set by the Question 1, 3, 5 and 7 code respectively, so
# those cells must have been run before this one.
print("\n" + "="*60)
print("COMPLETE MODEL PERFORMANCE SUMMARY")
print("="*60)
print(f"Q1  - Linear Regression R²:    {r2_test:.3f}")
print(f"Q3  - Ridge Regression R²:      {r2_test_ridge:.3f}")
print(f"Q5  - Lasso Regression R²:      {r2_test_lasso:.3f}")
print(f"Q7  - KNN RMSE:                 {rmse_knn:.3f}")
print(f"Q8  - Decision Tree RMSE:       {rmse_dt:.3f}")
print(f"Q9  - AdaBoost R²:              {r2_test_ada:.3f}")
print("="*60)


Question 8: Decision Tree Regressor

Model Training Complete!
Parameters used:
  - max_depth: 10
  - min_samples_split: 6
  - min_samples_leaf: 6
  - random_state: 42

------------------------------------------------------------
Question 8: RMSE on Test Dataset
------------------------------------------------------------
Mean Squared Error (MSE): 45438841.015
Root Mean Squared Error (RMSE): 6740.834

Answer for Question 8: 6740.834

Questions 9 & 10: AdaBoost with GridSearchCV

Hyperparameter Grid:
  - n_estimators: [10, 50, 100, 200, 500]
  - learning_rate: [0.1, 0.5, 1, 2]
  - Total combinations: 20

Performing GridSearchCV with cv=4...
This may take a few moments...
Fitting 4 folds for each of 20 candidates, totalling 80 fits

GridSearchCV Complete!

------------------------------------------------------------
Best Parameters Found:
------------------------------------------------------------
  - n_estimators: 10
  - learning_rate: 0.1
  - Best CV Score (R²): 0.5073

--------------