In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.preprocessing import LabelEncoder,StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_log_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold

In [8]:
train_df=pd.read_csv('data/train.csv')
test_df=pd.read_csv('data/test.csv')

In [9]:
train_df

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,...,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,...,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,...,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,...,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,...,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199995,1199995,36.0,Female,27316.0,Married,0.0,Master's,Unemployed,13.772907,Urban,...,,5.0,372.0,3.0,2023-05-03 15:21:39.257696,Poor,No,Daily,Apartment,1303.0
1199996,1199996,54.0,Male,35786.0,Divorced,,Master's,Self-Employed,11.483482,Rural,...,,10.0,597.0,4.0,2022-09-10 15:21:39.134960,Poor,No,Weekly,Apartment,821.0
1199997,1199997,19.0,Male,51884.0,Divorced,0.0,Master's,,14.724469,Suburban,...,0.0,19.0,,6.0,2021-05-25 15:21:39.106582,Good,No,Monthly,Condo,371.0
1199998,1199998,55.0,Male,,Single,1.0,PhD,,18.547381,Suburban,...,1.0,7.0,407.0,4.0,2021-09-19 15:21:39.190215,Poor,No,Daily,Apartment,596.0


In [10]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1200000 non-null  int64  
 1   Age                   1181295 non-null  float64
 2   Gender                1200000 non-null  object 
 3   Annual Income         1155051 non-null  float64
 4   Marital Status        1181471 non-null  object 
 5   Number of Dependents  1090328 non-null  float64
 6   Education Level       1200000 non-null  object 
 7   Occupation            841925 non-null   object 
 8   Health Score          1125924 non-null  float64
 9   Location              1200000 non-null  object 
 10  Policy Type           1200000 non-null  object 
 11  Previous Claims       835971 non-null   float64
 12  Vehicle Age           1199994 non-null  float64
 13  Credit Score          1062118 non-null  float64
 14  Insurance Duration    1199999 non-

In [11]:
train_df=train_df.drop('Premium Amount',axis=1)

In [12]:
numeric_cols=train_df.select_dtypes(include=['int','float']).columns
object_cols=train_df.select_dtypes(include=['object']).columns

In [13]:
numeric_test_cols=test_df.select_dtypes(include=['float','int']).columns
object_test_cols=test_df.select_dtypes(include=['object']).columns

In [14]:
numeric_cols

Index(['id', 'Age', 'Annual Income', 'Number of Dependents', 'Health Score',
       'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration'],
      dtype='object')

In [15]:
train_df.isnull().sum()

id                           0
Age                      18705
Gender                       0
Annual Income            44949
Marital Status           18529
Number of Dependents    109672
Education Level              0
Occupation              358075
Health Score             74076
Location                     0
Policy Type                  0
Previous Claims         364029
Vehicle Age                  6
Credit Score            137882
Insurance Duration           1
Policy Start Date            0
Customer Feedback        77824
Smoking Status               0
Exercise Frequency           0
Property Type                0
dtype: int64

In [16]:
train_df.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House


In [17]:
train_df[object_cols].head()

Unnamed: 0,Gender,Marital Status,Education Level,Occupation,Location,Policy Type,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,Female,Married,Bachelor's,Self-Employed,Urban,Premium,2023-12-23 15:21:39.134960,Poor,No,Weekly,House
1,Female,Divorced,Master's,,Rural,Comprehensive,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House
2,Male,Divorced,High School,Self-Employed,Suburban,Premium,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House
3,Male,Married,Bachelor's,,Rural,Basic,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment
4,Male,Single,Bachelor's,Self-Employed,Rural,Premium,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House


In [18]:
100*(train_df[object_cols].isnull().sum()/len(train_df[object_cols]))

Gender                 0.000000
Marital Status         1.544083
Education Level        0.000000
Occupation            29.839583
Location               0.000000
Policy Type            0.000000
Policy Start Date      0.000000
Customer Feedback      6.485333
Smoking Status         0.000000
Exercise Frequency     0.000000
Property Type          0.000000
dtype: float64

In [19]:
100*(train_df[numeric_cols].isnull().sum()/len(train_df[numeric_cols]))

id                       0.000000
Age                      1.558750
Annual Income            3.745750
Number of Dependents     9.139333
Health Score             6.173000
Previous Claims         30.335750
Vehicle Age              0.000500
Credit Score            11.490167
Insurance Duration       0.000083
dtype: float64

In [20]:
simple_imputer=SimpleImputer(missing_values=np.nan,strategy='mean')
le=LabelEncoder()
kni=KNNImputer(n_neighbors=5)
ohe=OneHotEncoder()
oe=OrdinalEncoder()
scaler=StandardScaler()

In [21]:
train_df[numeric_cols]=simple_imputer.fit_transform(train_df[numeric_cols])

In [22]:
test_df[numeric_test_cols]=simple_imputer.transform(test_df[numeric_test_cols])

In [23]:
object_cols

Index(['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location',
       'Policy Type', 'Policy Start Date', 'Customer Feedback',
       'Smoking Status', 'Exercise Frequency', 'Property Type'],
      dtype='object')

In [24]:
object_lst_ohe=['Gender','Policy Type','Property Type','Location']
object_lst_oe=['Exercise Frequency', 'Occupation','Customer Feedback','Smoking Status','Education Level','Policy Type','Marital Status']

In [25]:
train_df['Smoking Status'].unique()

array(['No', 'Yes'], dtype=object)

In [26]:
train_df.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,0.0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House
1,1.0,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House
2,2.0,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,592.92435,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House
3,3.0,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment
4,4.0,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House


In [27]:
train_df[object_lst_oe]=oe.fit_transform(train_df[object_lst_oe])
test_df[object_lst_oe]=oe.transform(test_df[object_lst_oe])

In [28]:
missing_val_cols=['Customer Feedback','Occupation','Marital Status']
for col in missing_val_cols:
    train_df[col]=train_df[col].fillna(train_df[col].value_counts().argmax())
    test_df[col]=test_df[col].fillna(train_df[col].value_counts().argmax())

In [29]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 20 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1200000 non-null  float64
 1   Age                   1200000 non-null  float64
 2   Gender                1200000 non-null  object 
 3   Annual Income         1200000 non-null  float64
 4   Marital Status        1200000 non-null  float64
 5   Number of Dependents  1200000 non-null  float64
 6   Education Level       1200000 non-null  float64
 7   Occupation            1200000 non-null  float64
 8   Health Score          1200000 non-null  float64
 9   Location              1200000 non-null  object 
 10  Policy Type           1200000 non-null  float64
 11  Previous Claims       1200000 non-null  float64
 12  Vehicle Age           1200000 non-null  float64
 13  Credit Score          1200000 non-null  float64
 14  Insurance Duration    1200000 non-

In [30]:
train_df['Policy Start Date']=pd.to_datetime(train_df['Policy Start Date'])
test_df['Policy Start Date']=pd.to_datetime(test_df['Policy Start Date'])

In [31]:
train_df['Policy Start Day']=train_df['Policy Start Date'].dt.day
train_df['Policy Start Month']=train_df['Policy Start Date'].dt.month
train_df['Policy Start Year']=train_df['Policy Start Date'].dt.year

test_df['Policy Start Day']=test_df['Policy Start Date'].dt.day
test_df['Policy Start Month']=test_df['Policy Start Date'].dt.month
test_df['Policy Start Year']=test_df['Policy Start Date'].dt.year

In [32]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 23 columns):
 #   Column                Non-Null Count    Dtype         
---  ------                --------------    -----         
 0   id                    1200000 non-null  float64       
 1   Age                   1200000 non-null  float64       
 2   Gender                1200000 non-null  object        
 3   Annual Income         1200000 non-null  float64       
 4   Marital Status        1200000 non-null  float64       
 5   Number of Dependents  1200000 non-null  float64       
 6   Education Level       1200000 non-null  float64       
 7   Occupation            1200000 non-null  float64       
 8   Health Score          1200000 non-null  float64       
 9   Location              1200000 non-null  object        
 10  Policy Type           1200000 non-null  float64       
 11  Previous Claims       1200000 non-null  float64       
 12  Vehicle Age           1200000 non-null  fl

In [33]:
train_df['Year sin']=np.sin(train_df['Policy Start Year']*2*np.pi)
test_df['Year sin']=np.sin(test_df['Policy Start Year']*2*np.pi)

train_df['Month sin']=np.sin(train_df['Policy Start Month']*2*np.pi)
test_df['Month sin']=np.sin(test_df['Policy Start Month']*2*np.pi)

train_df['Day sin']=np.sin(train_df['Policy Start Day']*2*np.pi)
test_df['Day sin']=np.sin(test_df['Policy Start Day']*2*np.pi)


In [34]:
train_df=train_df.drop('Policy Start Date',axis=1)
test_df=test_df.drop('Policy Start Date',axis=1)

In [35]:
train_df_dummy=pd.get_dummies(train_df[object_lst_ohe],drop_first=True)
test_df_dummy=pd.get_dummies(test_df[object_lst_ohe],drop_first=True)

In [36]:
train_df=train_df.drop(['Gender','Policy Type','Property Type','Location','Education Level'],axis=1)
test_df=test_df.drop(['Gender','Policy Type','Property Type','Location','Education Level'],axis=1)

In [37]:
train_df.head()

Unnamed: 0,id,Age,Annual Income,Marital Status,Number of Dependents,Occupation,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Policy Start Day,Policy Start Month,Policy Start Year,Year sin,Month sin,Day sin
0,0.0,19.0,10049.0,1.0,1.0,1.0,22.598761,2.0,17.0,372.0,5.0,2.0,0.0,3.0,23,12,2023,-6.447061e-13,-2.939152e-15,-1.27388e-14
1,1.0,39.0,31678.0,0.0,3.0,0.0,15.569731,1.0,12.0,694.0,2.0,0.0,1.0,1.0,12,6,2023,-6.447061e-13,-1.469576e-15,-2.939152e-15
2,2.0,23.0,25602.0,0.0,3.0,1.0,47.177549,1.0,14.0,592.92435,3.0,1.0,1.0,3.0,30,9,2023,-6.447061e-13,-2.204364e-15,-2.155874e-14
3,3.0,21.0,141855.0,1.0,2.0,0.0,10.938144,1.0,0.0,367.0,1.0,2.0,1.0,0.0,12,6,2024,1.585375e-14,-1.469576e-15,-2.939152e-15
4,4.0,21.0,39651.0,2.0,1.0,1.0,20.376094,0.0,8.0,598.0,4.0,2.0,1.0,3.0,1,12,2021,-1.468363e-13,-2.939152e-15,-2.449294e-16


In [38]:
train_df['C_H']=train_df['Credit Score']/train_df['Health Score']
test_df['C_H']=test_df['Credit Score']/test_df['Health Score']

In [39]:
train_df=pd.concat([train_df,train_df_dummy],axis=1)
test_df=pd.concat([test_df,test_df_dummy],axis=1)

In [40]:
train_df=train_df.drop(['Marital Status'],axis=1)
test_df=test_df.drop(['Marital Status'],axis=1)

In [41]:
train_df_dummy=pd.read_csv('data/train.csv')

In [42]:
X=train_df
y=train_df_dummy['Premium Amount']

In [43]:
X.head()

Unnamed: 0,id,Age,Annual Income,Number of Dependents,Occupation,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,...,Year sin,Month sin,Day sin,C_H,Policy Type,Gender_Male,Property Type_Condo,Property Type_House,Location_Suburban,Location_Urban
0,0.0,19.0,10049.0,1.0,1.0,22.598761,2.0,17.0,372.0,5.0,...,-6.447061e-13,-2.939152e-15,-1.27388e-14,16.46108,2.0,False,False,True,False,True
1,1.0,39.0,31678.0,3.0,0.0,15.569731,1.0,12.0,694.0,2.0,...,-6.447061e-13,-1.469576e-15,-2.939152e-15,44.573667,1.0,False,False,True,False,False
2,2.0,23.0,25602.0,3.0,1.0,47.177549,1.0,14.0,592.92435,3.0,...,-6.447061e-13,-2.204364e-15,-2.155874e-14,12.567935,2.0,True,False,True,True,False
3,3.0,21.0,141855.0,2.0,0.0,10.938144,1.0,0.0,367.0,1.0,...,1.585375e-14,-1.469576e-15,-2.939152e-15,33.55231,0.0,True,False,False,False,False
4,4.0,21.0,39651.0,1.0,1.0,20.376094,0.0,8.0,598.0,4.0,...,-1.468363e-13,-2.939152e-15,-2.449294e-16,29.348118,2.0,True,False,True,False,False


In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [45]:
y

0          2869.0
1          1483.0
2           567.0
3           765.0
4          2022.0
            ...  
1199995    1303.0
1199996     821.0
1199997     371.0
1199998     596.0
1199999    2480.0
Name: Premium Amount, Length: 1200000, dtype: float64

In [46]:
scaler=StandardScaler()
scaled_X_train=scaler.fit_transform(X_train)
scaled_X_test=scaler.transform(X_test)

In [47]:
lr=LinearRegression()
lr.fit(scaled_X_train,y_train)
preds=lr.predict(scaled_X_test)
print(root_mean_squared_log_error(y_test,preds))

1.1683486410195996


In [48]:
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [49]:
from xgboost import XGBRegressor

In [50]:
lgb_params = {
    'n_estimators': 1175, 
    'learning_rate': 0.02991020706767896, 
    'num_leaves': 79, 
    'max_depth': 13, 
    'min_child_samples': 12, 
    'subsample': 0.9633137940297378, 
    'colsample_bytree': 0.9637121094733179, 
    'reg_alpha': 8.846561105667421, 
    'reg_lambda': 3.9007247999299173
}

cat_params = {
    'iterations': 3000, 
    'learning_rate': 0.038365175314273574, 
    'depth': 11, 
    'l2_leaf_reg': 3.596285147607088, 
    'bagging_temperature': 0.2618728648567565
}

xgb_params = {
    'n_estimators': 1078, 
    'learning_rate': 0.016084079332671603, 
    'max_depth': 10, 
    'min_child_weight': 8, 
    'subsample': 0.8732132237392727, 
    'colsample_bytree': 0.9756972730817159, 
    'reg_alpha': 3.386299962300141, 
    'reg_lambda': 8.964009483088061
}

In [51]:
def eval_model(model_type,X_train,y_train,X_test,y_test,params):
    model=model_type(**params)
    model.fit(X_train,y_train)
    preds=model.predict(X_test)
    return root_mean_squared_log_error(y_test,preds)

In [55]:
eval_model(LGBMRegressor,scaled_X_train,y_train,scaled_X_test,y_test,params=lgb_params)

[WinError 2] The system cannot find the file specified
  File "c:\Users\gavat\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\gavat\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\gavat\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\gavat\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037173 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 960000, number of used features: 26
[LightGBM] [Info] Start training from score 1102.505529


1.1416273004325612

In [181]:
best_params = {
    'boosting_type': 'dart',
    'num_leaves': 384,
    'learning_rate': 0.024680120465142227,
    'feature_fraction': 0.9883068358315126,
    'bagging_fraction': 0.7201712704805496,
    'bagging_freq': 7,
    'min_data_in_leaf': 50,
    'max_depth': 15,
    'lambda_l1': 0.0011290211269753322,
    'lambda_l2': 3.056310541294088,
    'seed': 42
}

In [182]:
lgbm_model=LGBMRegressor(**best_params)
lgbm_model.fit(scaled_X_train,y_train)
preds=lgbm_model.predict(scaled_X_test)
print(root_mean_squared_log_error(y_test,preds))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016175 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 960000, number of used features: 26
[LightGBM] [Info] Start training from score 1102.505529
1.0742489665272579


In [44]:
final_df=pd.DataFrame()

In [45]:
scaled_test_df=scaler.transform(test_df)

In [46]:
scaled_test_df

array([[ 1.73202678, -0.97952587, -0.96384337, ...,  1.41281266,
        -0.70998081, -0.7032703 ],
       [ 1.73202967, -0.75612433,  2.95407268, ..., -0.70780793,
         1.40848878, -0.7032703 ],
       [ 1.73203255,  0.43535053, -0.49573663, ..., -0.70780793,
        -0.70998081,  1.42192837],
       ...,
       [ 4.04107646, -1.12846023,  0.07699904, ..., -0.70780793,
        -0.70998081,  1.42192837],
       [ 4.04107934, -0.5327228 ,  0.40896786, ..., -0.70780793,
        -0.70998081,  1.42192837],
       [ 4.04108223, -1.20292741, -0.25028301, ...,  1.41281266,
         1.40848878, -0.7032703 ]])

In [None]:
final_preds=lgbm_model.predict(scaled_test_df)



In [48]:
final_df['id']=test_df['id'].astype(int)
final_df['Premium Amount']=final_preds

In [49]:
final_df['Premium Amount']

0          977.360360
1          768.469131
2          744.772168
3          741.285022
4          723.740572
             ...     
799995     831.909356
799996    1029.416711
799997     752.359938
799998     782.194150
799999     738.466034
Name: Premium Amount, Length: 800000, dtype: float64

In [50]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 2 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              800000 non-null  int32  
 1   Premium Amount  800000 non-null  float64
dtypes: float64(1), int32(1)
memory usage: 9.2 MB


In [51]:
final_df.to_csv('data/submission_4.csv',index=False)