# All Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
import matplotlib.pyplot as plt

#Suppress the warnings
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Data Description

In [4]:
# NOTE(review): no visible cell creates `df`; it presumably has to be loaded before this cell - confirm
df.info() #column data types
df.dtypes

In [None]:
df.describe() #summary statistics

In [6]:
df.shape #(rows, columns) of the dataframe

In [None]:
pd.set_option('display.max_columns',None) #None removes the column limit so every feature name stays visible when a frame is displayed

In [None]:
# number of missing values in one column ('columnname' is presumably a placeholder for a real column)
df['columnname'].isnull().sum()

In [None]:
df.drop(['columnname'],axis=1) #returns a new frame and leaves df unchanged; assign the result (or pass inplace=True) to keep the change

In [None]:
# Split the features by dtype: numeric columns go to numx, object (text) columns to catx
numx = df.select_dtypes(include=['number'])
catx = df.select_dtypes(include=['object'])

In [None]:
#Outlier Analysis of Numerical Features

# A bare expression in the middle of a cell is never rendered by Jupyter (only the
# last expression is), so the pre-capping percentile table is shown explicitly.
display(numx.describe(percentiles=[0.01,0.05,0.10,0.25,0.50,0.75,0.85,0.9,0.99]))

# Cap (winsorise) outliers: values outside the chosen quantiles are clipped to those quantiles, not dropped
def outliercap(a, lower=0.01, upper=0.99):
    """Clip a numeric Series to its own lower and upper quantiles.

    Parameters
    ----------
    a : pandas.Series
        Numeric values to cap (one column at a time when used with DataFrame.apply).
    lower : float, default 0.01
        Quantile level used as the floor.
    upper : float, default 0.99
        Quantile level used as the ceiling.

    Returns
    -------
    pandas.Series
        A new Series with the same index; values below the floor are raised to it
        and values above the ceiling are lowered to it. The input is not modified.
    """
    # Both bounds are taken from the untouched data, so the ceiling is not
    # influenced by the values that the floor has already moved.
    floor = a.quantile(lower)
    ceiling = a.quantile(upper)
    return a.clip(lower=floor, upper=ceiling)

# Cap every numeric column, then re-check the percentile table
numx = numx.apply(outliercap)
check_percentiles = [0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.85, 0.9, 0.99]
numx.describe(percentiles=check_percentiles) # crosscheck

In [None]:
#concatenation column wise; join="inner" keeps only the index labels present in both frames
FInal_df=pd.concat([df1,df2],axis=1,join="inner") 

In [None]:
#Distinct values of the target column and how often each occurs

Y['TARGET'].value_counts()

# Encoders - Label Encoder

In [None]:
## encode object features to numeric
from sklearn.preprocessing import LabelEncoder ,Normalizer,MinMaxScaler

# One LabelEncoder per object column, stored in lencoders under the column name
lencoders = {}

object_columns = train.select_dtypes(include=['object']).columns
for col in object_columns:
    encoder = lencoders[col] = LabelEncoder()
    # overwrite the text column with its integer codes
    train[col] = encoder.fit_transform(train[col])

# Min-Max Scaling and standardization

In [None]:
cols=['columns name']
for col in cols:
    # Series.min()/.max() are vectorised and skip NaN; the built-in min()/max()
    # iterate in Python and give order-dependent results when NaN is present.
    minimum = data[col].min()
    maximum = data[col].max()

    # A constant column has zero range; dividing by 1 instead maps it to 0.0
    # rather than producing NaN from 0/0.
    value_range = maximum - minimum
    if value_range == 0:
        value_range = 1

    data[col] = (data[col] - minimum) / value_range
    
#     OR

from sklearn.preprocessing import MinMaxScaler
# Learn the per-column min/max, then show the fitted scaler and the rescaled values
scaler = MinMaxScaler()
scaler.fit(data)
print(scaler)

scaled_values = scaler.transform(data)
print(scaled_values)


from sklearn import preprocessing
scaler=preprocessing.StandardScaler()

#fit our data
# NOTE(review): values are read from `df` but written back to `data` - confirm both hold the same rows in the same order
scaled_values=scaler.fit_transform(df[cols])

# Reuse data's own index: column assignment below aligns on index labels, and a
# fresh 0..n-1 index would turn every non-matching row into NaN.
scaled_cols=pd.DataFrame(scaled_values,columns=cols,index=data.index)

# display() is needed because a bare expression mid-cell is never rendered
display(scaled_cols.head())

for col in cols:
    data[col]=scaled_cols[col]

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(X_all, Y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Sanity checks: sizes of both parts and the mean of the target in each
print("Shape of Training Data", X_train.shape)
print("Shape of Testing Data", X_test.shape)
print("Response Rate in Training Data", y_train.mean())
print("Response Rate in Testing Data", y_test.mean())

# Modelling

In [None]:
# Defining Training and Cross-Validation (k-Fold) Function

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

def train(model,X,Y,test_size=0.3,random_state=42):
    """Fit ``model`` and print its hold-out MSE and 5-fold cross-validated MSE.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict`` (scikit-learn style regressor).
    X : array-like
        Feature matrix.
    Y : array-like
        Target values.
    test_size : float, default 0.3
        Share of the rows held out for the MSE report.
    random_state : int, default 42
        Seed of the hold-out split, so repeated calls report comparable numbers.

    Returns
    -------
    None
        Results are printed. ``model`` is left fitted on the training part of the split.

    Notes
    -----
    NOTE(review): the name ``train`` is also used for a DataFrame in the
    label-encoding cell; defining this function rebinds that name.
    """
    # Local import: the function no longer depends on another cell having run first.
    from sklearn.model_selection import train_test_split

    # Split the data that was actually passed in instead of reading
    # X_train / X_test / y_train / y_test from the notebook's global state.
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    #fitting the model
    model.fit(X_train, y_train)

    #predict the test set
    y_pred = model.predict(X_test)

    #Perform Cross-Validation
    # scikit-learn scorers follow "higher is better", so MSE is exposed as
    # 'neg_mean_squared_error'; plain 'mean_squared_error' is rejected.
    cv_score = cross_val_score(model, X, Y, scoring='neg_mean_squared_error', cv=5)
    # the scores are negative MSE values; abs() turns their mean back into an MSE
    cv_score = np.abs(np.mean(cv_score))

    print("Model Report=")
    print("MSE=",mean_squared_error(y_test,y_pred))
    print("CV_score=",cv_score)

### Different Regression Models

In [None]:
from sklearn.linear_model import LinearRegression
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in 1.2,
# so passing it raises TypeError on current versions. If features should be
# standardised, use a StandardScaler step before the estimator instead.
model = LinearRegression()

train(model,predictors,target)

# one coefficient per feature, sorted ascending for the bar chart
coef=pd.Series(model.coef_,predictors.columns).sort_values()
coef.plot(kind='bar',title='Model Coefficients')

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Single regression tree with default settings
model = DecisionTreeRegressor()
train(model, predictors, target)

# Rank the features from most to least important and chart them
importances = pd.Series(model.feature_importances_, index=predictors.columns)
coef = importances.sort_values(ascending=False)
coef.plot(kind='bar', title='feature_importance')

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Seeded so the forest, and therefore the importance ranking, is the same on every run
# (42 matches the seed used for the train/test split).
model = RandomForestRegressor(random_state=42)

train(model,predictors,target)

# one importance per feature, largest first
importances=pd.Series(model.feature_importances_,index=predictors.columns)
coef=importances.sort_values(ascending=False)
coef.plot(kind='bar',title='feature_importance')

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
# Extra-trees draws its split thresholds at random; seeding makes the fitted model
# and the importance ranking repeatable across runs.
model = ExtraTreesRegressor(random_state=42)

train(model,predictors,target)

# one importance per feature, largest first
importances=pd.Series(model.feature_importances_,index=predictors.columns)
coef=importances.sort_values(ascending=False)
coef.plot(kind='bar',title='feature_importance')

In [None]:
import xgboost as xgb
# Gradient-boosted trees with default settings
model = xgb.XGBRegressor()
train(model, predictors, target)

# Rank the features from most to least important and chart them
importances = pd.Series(model.feature_importances_, index=predictors.columns)
coef = importances.sort_values(ascending=False)
coef.plot(kind='bar', title='feature_importance')

# Classification Models

In [None]:
from sklearn.tree import DecisionTreeClassifier

# creating model
model = DecisionTreeClassifier(max_depth=5)

# feeding the training set into the model
# (every split in this notebook names the feature matrices X_train / X_test;
#  the lowercase x_train / x_test used before are never defined)
model.fit(X_train, y_train)

# predicting the results for the test set
y_pred = model.predict(X_test)

# calculating the training and testing accuracies
print("Training accuracy :", model.score(X_train, y_train))
print("Testing accuracy :", model.score(X_test, y_test))

# Model Evaluation

In [None]:
#model Evaluation

from sklearn import metrics
from sklearn.metrics import confusion_matrix

# Hold-out metrics: y_pred holds the predicted labels for the rows of y_test
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision",metrics.precision_score(y_test,y_pred))
print("Recall",metrics.recall_score(y_test,y_pred))
print("f1_score",metrics.f1_score(y_test,y_pred))

# metrics.plot_confusion_matrix was removed in scikit-learn 1.2; ConfusionMatrixDisplay
# replaces it. The matrix is drawn from the same test-set predictions as the scores
# above, so the plot and the printed numbers describe the same rows.
metrics.ConfusionMatrixDisplay.from_predictions(y_test, y_pred)

# Deep Learning- Regression

In [None]:
import tensorflow 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# 80/20 split with a fixed seed
X_train,X_test,y_train,y_test=train_test_split(x,y,random_state=101,test_size=0.2)

#creating a model
# fully connected network narrowing 32 -> 16 -> 8 -> 4 -> 2 units, with a single linear output unit
model=Sequential([
                  Dense(32,activation='relu',kernel_initializer='he_normal'),
                  Dense(16,activation='sigmoid'),
                  Dense(8,activation='relu',kernel_initializer='he_normal'),
                  Dense(4,activation='sigmoid'),
                  Dense(2,activation='relu',kernel_initializer='he_normal'),
                  Dense(1),
])

# mean absolute error is both the training loss and the reported metric
model.compile(optimizer='SGD',loss=tensorflow.keras.losses.mae,metrics=['mae'])

# Train on the training split only: fitting on the full x/y would show the model
# the held-out rows and make the split above pointless.
# `history` is the descriptive name for the returned History object; `random` is
# kept as an alias for any cell that still refers to it.
history=random=model.fit(X_train,y_train,epochs=50,batch_size=32,validation_split=0.2)

# Predict the held-out rows, which line up with y_test for evaluation.
# NOTE(review): this used to predict on `data`; switch back if scoring a separate dataset was intended.
y_pred=model.predict(X_test)

# Deep Learning- Classification

In [None]:
import tensorflow 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 80/20 split with a fixed seed
X_train , X_test , y_train , y_test= train_test_split(x,y , random_state = 101 , test_size = 0.2 )

# fully connected network with dropout after every hidden layer;
# the single sigmoid output unit yields one probability per row
model=Sequential([
                  Dense(128 , activation='relu' , kernel_initializer= 'he_normal',),
                  Dropout(0.5),

                  Dense(64, activation='sigmoid' ),
                  Dropout(0.5),

                  Dense(32, activation='relu' , kernel_initializer= 'he_normal'),
                  Dropout(0.3),

                  Dense(16, activation='sigmoid' ),
                  Dropout(0.2),

                  Dense(8, activation='relu' , kernel_initializer= 'he_normal'),
                  Dropout(0.1),

                  Dense(1 , activation='sigmoid')
])

model.compile(optimizer='SGD' , loss = 'binary_crossentropy' , metrics=['accuracy'])


# `history` is the descriptive name for the returned History object; `random` is
# kept as an alias for any cell that still refers to it.
history = random = model.fit(X_train , y_train, batch_size=32 , epochs=50 , validation_split=0.2)

# y_pred holds the sigmoid outputs (probabilities), one column per output unit
y_pred = model.predict(X_test)

from sklearn.metrics import accuracy_score
# accuracy_score compares class labels and rejects continuous scores, so the
# probabilities are thresholded at 0.5 and flattened to one label per row.
y_pred_labels = (y_pred > 0.5).astype(int).ravel()
# argument order is (y_true, y_pred)
accuracy_score(y_test , y_pred_labels)