In [None]:
# Install the third-party packages this notebook imports (run once per fresh runtime).
!pip install lightgbm

# pandas-profiling is installed from the GitHub master archive; the plain
# PyPI install is kept below, commented out, as the alternative.
#!pip install pandas-profiling
!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip 
# pyyaml is pinned to an exact version — presumably for compatibility with
# pandas-profiling; TODO confirm the pin is still required.
!pip install pyyaml==5.4.1

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMRegressor
import warnings
from pandas_profiling import ProfileReport
from google.colab import files
warnings.filterwarnings("ignore")

In [14]:
# Read the two input CSVs from /content: the labelled training file
# (it carries the `cltv` column used as target) and the file to be scored.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/train_BRCpofr.csv", "/content/test_koRSKBP.csv")
)


In [None]:
# Build a pandas-profiling report of the training frame, write it to disk
# as HTML and hand the file to the Colab download helper.
profile = ProfileReport(
    train,
    title='EDA',
    html={'style': {'full_width': True}},
)
report_path = "EDA.html"
profile.to_file(report_path)
files.download(report_path)

In [15]:
#@title Preprocessing updated

# Columns this cell adds to `train`. They are dropped up front so the cell can
# be re-run: merging a second time would otherwise produce suffixed duplicates
# (cltv_5_x / cltv_5_y, ...) and the column lookups further down would fail.
percentile_cols = ['cltv_5', 'cltv_95', 'cltv_num_policies_5', 'cltv_num_policies_95']
train = train.drop(columns=percentile_cols, errors='ignore')

# NOTE(review): the percentile features below are derived from the target over
# ALL rows of `train`, i.e. before the hold-out split at the end of this cell,
# so hold-out rows contribute to their own features and the hold-out score is
# optimistic. Computing them on the training fold only would remove the leak.

#taking 5th-95th percentile of cltv based on area
cltv_5 = train.groupby(['area'])["cltv"].apply(lambda arr: np.percentile(arr,5)).rename('cltv_5')
cltv_95 = train.groupby(['area'])["cltv"].apply(lambda arr: np.percentile(arr,95)).rename('cltv_95')
train = train.merge(cltv_95, on=['area'], how='left')
train = train.merge(cltv_5, on=['area'], how='left')

#taking 5th-95th percentile of cltv based on num_policies
cltv_num_policies_5 = train.groupby(['num_policies'])["cltv"].apply(lambda arr: np.percentile(arr,5)).rename('cltv_num_policies_5')
cltv_num_policies_95 = train.groupby(['num_policies'])["cltv"].apply(lambda arr: np.percentile(arr,95)).rename('cltv_num_policies_95')
train = train.merge(cltv_num_policies_95, on=['num_policies'], how='left')
train = train.merge(cltv_num_policies_5, on=['num_policies'], how='left')

# Target and feature matrix. `train` keeps the raw (unscaled) columns; only the
# copy `X` is standardised and encoded below.
y = train["cltv"]
X = train.drop(["cltv","id"],axis=1)

# Standardizing numerical features. StandardScaler works column by column, so
# one scaler over the list gives the same values as one scaler per column.
numeric_cols = ['claim_amount', 'vintage', 'cltv_5', 'cltv_95',
                'cltv_num_policies_95', 'cltv_num_policies_5']
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

# One-hot encode the categorical feature
X = pd.get_dummies(X, columns=["gender","area","qualification","income","num_policies","policy","type_of_policy"])

# Hold out 20% of the rows for evaluation. The seed is fixed so the split, and
# therefore every score computed from it, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
#@title Train Model

# Base estimator handed to the grid search; the fixed seed is carried into
# every candidate the search clones from it.
model = LGBMRegressor(random_state=42)

# Define the parameter grid (3 values for each of 5 parameters = 243
# combinations; with cv=3 that is 729 fits before the final refit).
param_grid = {
    'num_leaves': [31, 63, 127],
    'learning_rate': [0.1, 0.05, 0.01],
    'n_estimators': [100, 200, 300],
    'max_depth': [-1, 5, 10],
    'min_data_in_leaf': [10, 20, 30]
}

# Create the grid search object (3-fold CV, all available cores)
grid_search = GridSearchCV(model, param_grid, cv=3, n_jobs=-1)

# Fit the grid search object to the data
grid_search.fit(X_train, y_train)

# best set of hyperparameters
best_params = grid_search.best_params_

# GridSearchCV refits the winning configuration on all of X_train by default
# (refit=True), so the tuned model already exists. Reusing it avoids a second,
# redundant fit and keeps random_state=42, which rebuilding the estimator from
# best_params alone would drop.
best_model = grid_search.best_estimator_



GridSearchCV(cv=3, estimator=LGBMRegressor(random_state=42), n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.05, 0.01],
                         'max_depth': [-1, 5, 10],
                         'min_data_in_leaf': [10, 20, 30],
                         'n_estimators': [100, 200, 300],
                         'num_leaves': [31, 63, 127]})

In [13]:
#@title R2 score on seen and unseen data

# Rows the tuned model was fitted on.
y_train_pred = best_model.predict(X_train)
r2_score_train = r2_score(y_true=y_train, y_pred=y_train_pred)
print("r2_score on train data ", r2_score_train)

# Rows held out by the train/test split.
y_test_pred = best_model.predict(X_test)
r2_score_test = r2_score(y_true=y_test, y_pred=y_test_pred)
print("r2_score on unseen data ", r2_score_test)

r2_score on train data  0.1690478689285697
r2_score on unseen data  0.1640679146580054


In [None]:
#@title Predicting CLTV for the given Test File

# Work on a separate frame so `test` keeps its "id" column and this cell can
# be re-run without a KeyError.
test_features = test.drop(columns=["id"])

# Attach the per-area and per-num_policies CLTV percentiles that were computed
# from the training data in the preprocessing cell.
test_features = test_features.merge(cltv_95, on=['area'], how='left')
test_features = test_features.merge(cltv_5, on=['area'], how='left')
test_features = test_features.merge(cltv_num_policies_95, on=['num_policies'], how='left')
test_features = test_features.merge(cltv_num_policies_5, on=['num_policies'], how='left')

# Standardise every column that was standardised for training, using the
# TRAINING statistics. `train` still holds the raw values of these columns
# (only the copy `X` was scaled), so fitting on it reproduces the scaling the
# model saw. Fitting on the test file, or leaving a column unscaled, would
# feed the model features on a different scale than it was trained on.
scaled_cols = ['claim_amount', 'vintage', 'cltv_5', 'cltv_95',
               'cltv_num_policies_95', 'cltv_num_policies_5']
scaler = StandardScaler()
scaler.fit(train[scaled_cols])
test_features[scaled_cols] = scaler.transform(test_features[scaled_cols])

# One-hot encode the categorical feature
test_features = pd.get_dummies(test_features, columns=["gender","area","qualification","income","num_policies","policy","type_of_policy"])

# Match the training matrix exactly: same columns, same order. A category
# level absent from the test file becomes an all-zero column; a level that
# never occurred in training is dropped because the model has no feature for it.
test_features = test_features.reindex(columns=X.columns, fill_value=0)

#dataframe for storing test result
Predict_CLTV = pd.DataFrame({'id': test["id"]})
Predict_CLTV['cltv'] = best_model.predict(test_features)
Predict_CLTV.to_csv("Predict_CLTV.csv",index=False)