In [1]:
# import the modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import preprocessing
import warnings
warnings.filterwarnings("ignore")


### Load the dataset

- Load the train data and using all your knowledge try to explore the different statistical properties of the dataset.

In [2]:
# Code starts here
train = pd.read_csv('train.csv')
#print(train.head(5))
print(train.shape)
#print(train.info())
#print(train.describe())
print(train.skew())


# Code ends here

     Id  Unnamed: 0      attr0      attr1      attr2      attr3      attr4  \
0  2216        2216  -4.374765  13.819856  14.656331  -9.728919 -19.334897   
1  2673        2673 -13.796261  -4.647589  21.676617  -0.122074  11.228644   
2  5603        5603  -2.115400  -3.332400  -6.640000 -13.825000   4.123200   
3  6401        6401 -25.531000  66.699000 -13.025000 -31.198000  12.016000   
4  6043        6043  18.993000  -5.620000  -9.964900   3.307200   0.999760   

       attr5     attr6      attr7  ...    attr1080   attr1081   attr1082  \
0   0.344455  11.10572  21.977302  ...   89.083581  86.194838  93.162055   
1  -8.806895  -9.16119  18.025709  ...  100.750899  83.373142  76.902208   
2  27.365000   6.70020   3.783000  ...   52.917000  34.799000  42.562000   
3  19.365000   5.04510  20.418000  ...   49.488000  71.633000  66.757000   
4 -10.920000 -11.39200   3.918500  ...   84.508000  89.976000  61.169000   

     attr1083    attr1084    attr1085    attr1086    attr1087    attr1088 

### Visualize the data

- Check the distribution of the target variable. Is the data imbalanced?
- Clean the data, apply some data preprocessing and engineering techniques.

In [None]:
# Code starts here

train['attr1089'].value_counts().plot(kind='bar')

plt.figure(figsize=(32,28))
sns.heatmap(train.corr(),cmap='viridis',annot=True)

# Code ends here.

### Model building

- Split the data into train and test.
- Now let's come to the actual task, predict the values of `attr1089` after building a Machine learning model.
- Try improving upon the `roc_auc_score` ([ROC-AUC Score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html#sklearn.metrics.roc_auc_score))

In [3]:
# Code Starts here
X = train.iloc[:,:-1]
y = train.iloc[:,-1]

# Separate into train and test data
X_train, X_val, y_train, y_val = train_test_split(X,y,random_state=42,test_size=0.2)

# Code ends here

In [8]:
# Fitting a Random Forest model on the train data and predict on the test data
# It will take a long time to execute
model = RandomForestClassifier(n_estimators=100,random_state=8,n_jobs=4)
rfe = RFE(model, 42)
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_val)
model.fit(X_train_rfe,y_train)
y_pred_rfe = model.predict(X_test_rfe)
print(roc_auc_score(y_val,y_pred_rfe))

0.9207839262187089


### Prediction on the test data and creating the sample submission file.

- Load the test data and store the `Id` column in a separate variable.
- Perform the same operations on the test data that you have performed on the train data.
- Create the submission file as a `csv` file consisting of the `Id` column from the test data and your prediction as the second column.

In [9]:
# Code Starts here

test = pd.read_csv('test.csv')
# Storing the id from the test file
id_ = test['Id']

# Applying rfe on test data
test_rfe = rfe.transform(test)

# Predict on the test data
y_pred_test = model.predict(test_rfe)

# Create a sample submission file
sample_submission_result = pd.DataFrame({'Id':id_,'attr1089':y_pred_test})

# Convert the sample submission file into a csv file
sample_submission_result.to_csv('sample_submission_result.csv',index=False)

# Code ends here