### **Import Necessary Libraries**

In [1]:
#Import necessary libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20,10)
import seaborn as sns

#Preprocessing
from sklearn import model_selection,metrics
from sklearn.preprocessing import OrdinalEncoder
from sklearn import preprocessing
from sklearn.model_selection import train_test_split,KFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OrdinalEncoder,LabelEncoder
#Model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,roc_auc_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import lightgbm as lgb
from sklearn.metrics import mean_squared_log_error,mean_squared_error
from catboost import CatBoostRegressor



### **Read Data & Shape**

In [2]:
# Load the three competition CSVs: training data, test data and the
# sample-submission file.
train = pd.read_csv("../input/song-popularity-prediction/train.csv")
test = pd.read_csv("../input/song-popularity-prediction/test.csv")
sample = pd.read_csv("../input/song-popularity-prediction/sample_submission.csv")

# Shapes of all three frames on one line, followed by the number of null
# cells in each column of the training data.
print(*(frame.shape for frame in (train, test, sample)))
print(train.isna().sum())

# Summary statistics with one row per column (last expression -> rich display).
train.describe().T

### **Visualize Missing Values**

In [3]:


# Figure defaults (10x12) and seaborn 'paper' context; both are global
# settings and stay in effect for later cells.
plt.rc('figure', figsize=(10, 12))
sns.set_context('paper', font_scale=1)

# One heatmap row per column of `train`, annotated with that column's
# count of null cells.
missing_counts = train.isnull().sum().to_frame()
ax = sns.heatmap(missing_counts, annot=True, fmt='d', cmap='RdYlGn')
ax.set_title('Missing value status', fontweight='bold')
ax.set_xlabel('Amount Missing')
plt.show()



### **Apply SimpleImputer(Median)** 

In [4]:
# Median imputation of missing values.
# (SimpleImputer is already imported in the first cell of the notebook.)
#
# The medians are learned from the TRAINING data only and then applied to
# both frames. Re-fitting the imputer on the test set would fill train and
# test with different statistics and leak test-set information into
# preprocessing.

# Train: fit and fill every column of the training frame.
imputer = SimpleImputer(strategy='median')
train_im = pd.DataFrame(imputer.fit_transform(train), columns=train.columns)

# Test: the test frame has fewer columns than train, so a second imputer is
# fitted on the training rows restricted to the test frame's columns.
# Assumes every test column also exists in train — TODO confirm.
test_imputer = SimpleImputer(strategy='median')
test_imputer.fit(train[test.columns])
test_im = pd.DataFrame(test_imputer.transform(test), columns=test.columns)

# From here on `train` / `test` refer to the imputed frames
# (all-float values, fresh 0..n-1 index).
train = train_im
test = test_im

### **KFold**

In [5]:
# 5-fold split with a fixed seed so the fold assignment is reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Every row starts at -1 and is then stamped with the index of the fold in
# which it belongs to the validation part.
train['kfold'] = -1
for fold, (tr_i, va_i) in enumerate(kfold.split(train)):
    train.loc[va_i, 'kfold'] = fold

# Rows per fold, then persist the fold-annotated frame for the modelling cells.
print(train['kfold'].value_counts())
train.to_csv("folds_5.csv", index=False)
print("successfully folds")


### **Re-check Missing Values After Imputation**

In [6]:


# Same figure defaults as the earlier missing-value plot.
plt.rc('figure', figsize=(10, 12))
sns.set_context('paper', font_scale=1)

# Null-count heatmap for the current `train` frame: one annotated row per
# column, showing how many cells in that column are null.
null_totals = train.isnull().sum().to_frame()
ax = sns.heatmap(null_totals, annot=True, fmt='d', cmap='RdYlGn')
ax.set_title('Missing value status', fontweight='bold')
ax.set_xlabel('Amount Missing')
plt.show()



### **Feature_Correlation**

In [7]:
# Pairwise correlations between all columns of `train`, rounded to 5 decimals.
heat = train.corr().round(5)

# The matrix is symmetric, so hide the upper triangle (diagonal included):
# entries set to 1 in the mask are not drawn.
mask = np.triu(np.ones(heat.shape))

# Draw the lower-triangle heatmap on a 16x16 figure.
plt.figure(figsize=(16, 16))
ax = sns.heatmap(
    heat,
    annot=False,
    mask=mask,
    cmap="RdYlGn",
    annot_kws={"weight": "bold", "fontsize": 13},
)
ax.set_title("Feature correlation heatmap", fontsize=17)

# Vertical x tick labels, horizontal y tick labels, both right-aligned on
# their anchor point.
plt.setp(
    ax.get_xticklabels(),
    rotation=90, ha="right", rotation_mode="anchor", weight="normal",
)
plt.setp(
    ax.get_yticklabels(),
    weight="normal", rotation_mode="anchor", rotation=0, ha="right",
)
plt.show();


### **Feature Selection**

In [9]:
# Reload the fold-annotated training data written by the KFold cell.
df = pd.read_csv("./folds_5.csv")

# Model inputs are every column except the row id, the fold marker and the
# target.
non_feature_cols = ("id", "kfold", "song_popularity")
features = [col for col in df.columns if col not in non_feature_cols]

# Keep only those columns in the test frame, in the same order.
test = test[features]

### **XGBoost Training & Test Predictions**

In [10]:
# 5-fold cross-validated XGBoost training.
# For each fold: fit on the other four folds, score the held-out fold with
# ROC AUC, and predict the test set. Test predictions are collected per fold
# in `prediction`, validation scores in `score`.

# Hyperparameters are the same for every fold, so they are defined once
# outside the loop.
xgb_params = {
    'learning_rate': 0.001235,
    'subsample': 0.95312,
    'colsample_bytree': 0.1107,
    'max_depth': 3,
    'booster': 'gbtree',
    'reg_lambda': 66.156,
    'reg_alpha': 14.68267919457715,
    'random_state': 42,
    'n_estimators': 15000
}

prediction = []  # one array of test-set predictions per fold
score = []       # one validation ROC AUC per fold

for fold in range(5):
    # Rows whose `kfold` equals the current fold are held out for validation.
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = test.copy()

    ytrain = xtrain.song_popularity
    yvalid = xvalid.song_popularity

    # Explicit copies: the scaled values are assigned back into these frames
    # below, and writing into a bare column selection triggers pandas'
    # SettingWithCopyWarning.
    xtrain = xtrain[features].copy()
    xvalid = xvalid[features].copy()

    # Standardise features using statistics from the training part of the
    # fold only; validation and test rows reuse the fitted scaler.
    scaler = StandardScaler()
    xtrain[features] = scaler.fit_transform(xtrain[features])
    xvalid[features] = scaler.transform(xvalid[features])
    xtest[features] = scaler.transform(xtest[features])

    # NOTE(review): `gpu_id` / `tree_method='gpu_hist'` / `predictor` and
    # passing `early_stopping_rounds` to fit() are reported as deprecated in
    # newer XGBoost releases — confirm against the installed version.
    model = XGBRegressor(**xgb_params,
                         gpu_id=0,
                         tree_method='gpu_hist',
                         predictor='gpu_predictor')
    # NOTE: early stopping monitors the same held-out fold that is scored
    # below, so the per-fold scores are somewhat optimistic.
    model.fit(xtrain, ytrain, early_stopping_rounds=100,
              eval_set=[(xvalid, yvalid)], verbose=False)
    preds_valid = model.predict(xvalid)

    # Apply the fold's model to the test data and keep its predictions.
    test_predict = model.predict(xtest)
    prediction.append(test_predict)

    # Validation score for this fold.
    roc1 = roc_auc_score(yvalid, preds_valid)
    score.append(roc1)
    print(f"fold|split:{fold},roc:{roc1}")

# Mean and standard deviation of the per-fold validation scores.
print(np.mean(score), np.std(score))

In [12]:
# Put each fold's test predictions in its own column, then average across
# the columns to get one value per test row.
fold_matrix = np.column_stack(prediction)
final_predict = fold_matrix.mean(axis=1)
print(final_predict)

# Write the averaged predictions into the sample-submission frame and save it.
sample["song_popularity"] = final_predict
sample.to_csv("submission_xgb.csv", index=False)
print("Final achieve to send xgboost output data")


## **Thank You**