In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc, classification_report

In [3]:
# Display full legnth of outputs
pd.options.display.max_seq_items = 200000 
pd.options.display.max_rows = 400000

In [4]:
df_test = pd.read_csv("test.csv")
df_train = pd.read_csv("train.csv")

In [5]:
df_test.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


In [6]:
df_train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Drop the id column from both test and training dataset
df_test = df_test.drop(columns = 'ID')
df_train = df_train.drop(columns = 'ID')

#### For all column(s) where the variance is equal to zero, it is removed

In [9]:
for col in df_train.describe(include = 'number').columns:
    if df_train[col].var() == 0:
        df_train = df_train.drop(columns = col)
        df_test  = df_test.drop(columns = col)

In [10]:
for col in df_test.describe(include = 'number').columns:
    if df_test[col].var() == 0:
        df_test = df_test.drop(columns = col)
        df_train = df_train.drop(columns = col)

In [11]:
for i,x in zip(df_test.isnull().sum(),df_train.isnull().sum()):
    if (i == 1) or (x == 2):
        print("Null Value")

In [12]:
# checking for null values
print(df_train.isnull().sum().sum())
print("--------")
print(df_test.isnull().sum().sum())

0
--------
0


### No null values found

In [14]:
# Printing unique value for test and train data set
for col in df_train.describe(exclude = "number"):
    print(f"Unique values of {col}")
    print(df_train[col].unique())
    print("----------------------------------------------------------")
    print("----------------------------------------------------------")

Unique values of X0
['k' 'az' 't' 'al' 'o' 'w' 'j' 'h' 's' 'n' 'ay' 'f' 'x' 'y' 'aj' 'ak' 'am'
 'z' 'q' 'at' 'ap' 'v' 'af' 'a' 'e' 'ai' 'd' 'aq' 'c' 'aa' 'ba' 'as' 'i'
 'r' 'b' 'ax' 'bc' 'u' 'ad' 'au' 'm' 'l' 'aw' 'ao' 'ac' 'g' 'ab']
----------------------------------------------------------
----------------------------------------------------------
Unique values of X1
['v' 't' 'w' 'b' 'r' 'l' 's' 'aa' 'c' 'a' 'e' 'h' 'z' 'j' 'o' 'u' 'p' 'n'
 'i' 'y' 'd' 'f' 'm' 'k' 'g' 'q' 'ab']
----------------------------------------------------------
----------------------------------------------------------
Unique values of X2
['at' 'av' 'n' 'e' 'as' 'aq' 'r' 'ai' 'ak' 'm' 'a' 'k' 'ae' 's' 'f' 'd'
 'ag' 'ay' 'ac' 'ap' 'g' 'i' 'aw' 'y' 'b' 'ao' 'al' 'h' 'x' 'au' 't' 'an'
 'z' 'ah' 'p' 'am' 'j' 'q' 'af' 'l' 'aa' 'c' 'o' 'ar']
----------------------------------------------------------
----------------------------------------------------------
Unique values of X3
['a' 'e' 'c' 'f' 'd' 'b' 'g']
--------

In [15]:
for col in df_test.describe(exclude = "number"):
    print(f"Unique values of {col}")
    print(df_test[col].unique())
    print("----------------------------------------------------------")
    print("----------------------------------------------------------")

Unique values of X0
['az' 't' 'w' 'y' 'x' 'f' 'ap' 'o' 'ay' 'al' 'h' 'z' 'aj' 'd' 'v' 'ak'
 'ba' 'n' 'j' 's' 'af' 'ax' 'at' 'aq' 'av' 'm' 'k' 'a' 'e' 'ai' 'i' 'ag'
 'b' 'am' 'aw' 'as' 'r' 'ao' 'u' 'l' 'c' 'ad' 'au' 'bc' 'g' 'an' 'ae' 'p'
 'bb']
----------------------------------------------------------
----------------------------------------------------------
Unique values of X1
['v' 'b' 'l' 's' 'aa' 'r' 'a' 'i' 'p' 'c' 'o' 'm' 'z' 'e' 'h' 'w' 'g' 'k'
 'y' 't' 'u' 'd' 'j' 'q' 'n' 'f' 'ab']
----------------------------------------------------------
----------------------------------------------------------
Unique values of X2
['n' 'ai' 'as' 'ae' 's' 'b' 'e' 'ak' 'm' 'a' 'aq' 'ag' 'r' 'k' 'aj' 'ay'
 'ao' 'an' 'ac' 'af' 'ax' 'h' 'i' 'f' 'ap' 'p' 'au' 't' 'z' 'y' 'aw' 'd'
 'at' 'g' 'am' 'j' 'x' 'ab' 'w' 'q' 'ah' 'ad' 'al' 'av' 'u']
----------------------------------------------------------
----------------------------------------------------------
Unique values of X3
['f' 'a' 'c' 'e' 'd' 

In [16]:
# Performing Label Encoding
from sklearn.preprocessing import LabelEncoder

In [17]:
lb = LabelEncoder()
for col in df_test.describe(exclude = "number"):
    df_test[col] = lb.fit_transform(df_test[col])
for col in df_train.describe(exclude = "number"):
    df_train[col] = lb.fit_transform(df_train[col])

In [18]:
# Splitting the data into X and y
X = df_train.drop(columns = 'y')
y = df_train['y']

### Using PCA for dimensionality Reduction

In [20]:
from sklearn.decomposition import PCA
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate

In [21]:
# Using standardScaler to standardize the data
st = StandardScaler()
X_scaled = st.fit_transform(X)
df_test_scaled = st.fit_transform(df_test)

In [22]:
# Using PCA with 190 components for dimensionality reduction
pca = PCA(n_components = 190)
X_pca = pd.DataFrame(pca.fit_transform(X_scaled))
df_test_pca = pd.DataFrame(pca.fit_transform(df_test_scaled))

### Using XGBoost and cross validation to get Mean Absolute error and R-Squared

In [24]:
xgb = XGBRegressor(max_depth = 3,n_estimators = 30,random_state = 25)
score = cross_validate(xgb,X_pca,y,cv=10,scoring=['neg_mean_absolute_error','r2'])
result = pd.DataFrame({
'MAE':[-score['test_neg_mean_absolute_error'].mean()],
'R-Squared':[score['test_r2'].mean()],
},index=['Result'])

In [25]:
result

Unnamed: 0,MAE,R-Squared
Result,6.180854,0.475393


#### I used XGBRegressor with no parameters but the R-squared was low, after adding max_depth and n_estimators it increased from 0.41 to 0.47 

In [27]:
# Predicting y for df_test
xgb.fit(X_pca,y)
y_pred_pca = xgb.predict(df_test_pca)

### Using XGBoost without PCA(Dimensionality Reduction)

In [64]:
xgb = XGBRegressor(max_depth = 3,n_estimators = 30,random_state = 25)
score = cross_validate(xgb,X_scaled,y,cv=10,scoring=['neg_mean_absolute_error','r2'])
result = pd.DataFrame({
'MAE':[-score['test_neg_mean_absolute_error'].mean()],
'R-Squared':[score['test_r2'].mean()],
},index=['Result'])

In [65]:
result

Unnamed: 0,MAE,R-Squared
Result,5.399357,0.572206


In [66]:
xgb.fit(X_scaled,y)
y_pred = xgb.predict(df_test_scaled)

### The results without dimensionality reduction are better as MAE is less and R2 is high as well.