In [1]:
import pandas as pd
import numpy as np 
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn import preprocessing

#import feature selection modules
from sklearn.feature_selection import mutual_info_classif,RFE,RFECV

#import classification modules
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

#import classification evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold 
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve 
from sklearn.metrics import f1_score
from sklearn.metrics import auc

In [114]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from math import sqrt

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR

In [3]:
def load_data(path='Customer Churn.csv'):
    """Read the customer-churn dataset from a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to 'Customer Churn.csv', resolved
        against the current working directory, so existing zero-argument
        calls keep reading the same file.

    Returns
    -------
    pandas.DataFrame
        The parsed file exactly as ``pd.read_csv`` returns it with its
        default options (no cleaning or type conversion is applied here).
    """
    return pd.read_csv(path)
# Module-level copy of the dataset for interactive use in later cells.
# MachineLearningwithRFFS() calls load_data() again itself, so it does not
# read this variable.
dmfraud = load_data()

In [4]:
def manual_split(dmfraud,split,random):
    """Separate the label column from the features and make a hold-out split.

    Parameters
    ----------
    dmfraud : pandas.DataFrame
        Full dataset; must contain a 'ChurnIndicator' column, which is used
        as the label. The frame itself is not modified.
    split : passed to ``train_test_split`` as ``test_size``.
    random : passed to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(X, trainX, testX, trainY, testY, y)`` where ``X`` is the dataset
        without the label column and ``y`` is a copy of the label column.
    """
    label_col = 'ChurnIndicator'

    # Independent copy of the labels, taken before the column is dropped.
    y = dmfraud[label_col].copy()

    # drop() returns a new frame, so the caller's frame keeps its label column.
    X = dmfraud.drop(columns=[label_col])

    parts = train_test_split(X, y, test_size=split, random_state=random)
    trainX, testX, trainY, testY = parts

    return X, trainX, testX, trainY, testY, y

In [49]:
def validationmetrics_regression(model,testX,testY):
    """Print the predictions of a fitted regressor and its test-set metrics.

    Parameters
    ----------
    model : object exposing ``predict(testX)``.
    testX : features to predict on.
    testY : true target values matching ``testX``.

    Returns
    -------
    None
        Everything is reported through ``print``: the prediction vector,
        the RMSE, and the R-squared score (printed twice, see below).
    """
    predictions = model.predict(testX)
    print("Prediction Vector: \n", predictions)

    # mean_squared_error is called with its defaults; the square root of
    # that value is what gets reported as RMSE.
    rmse = sqrt(mean_squared_error(testY, predictions))
    print("RMSE: ", rmse)

    # R-squared with r2_score's default settings.
    r2_default = r2_score(testY, predictions)
    print("R-squared score: ", r2_default)

    # Same score with the aggregation named explicitly; the other accepted
    # values are 'variance_weighted' and 'raw_values'. The recorded notebook
    # output shows this line matching the one above.
    r2_uniform = r2_score(testY, predictions, multioutput='uniform_average')
    print("R-squared score : ", r2_uniform)
    

In [33]:
# Linear regression
def LinReg(dmfraud, trainX, testX, trainY, testY):
    """Fit a LinearRegression model and print its test-set metrics.

    Parameters
    ----------
    dmfraud : not used by this function; it only occupies the first slot of
        the signature.
    trainX, trainY : training features and targets passed to ``fit``.
    testX, testY : held-out features and targets passed to
        ``validationmetrics_regression``.

    Returns
    -------
    None
        Results are printed by ``validationmetrics_regression``.
    """
    # The object returned by fit() is the one handed to the metrics helper.
    regressor = LinearRegression().fit(trainX, trainY)
    validationmetrics_regression(regressor, testX, testY)

In [42]:
# Random Forest
def RandFor(dmfraud, trainX, testX, trainY, testY):
    """Fit a RandomForestRegressor and print its test-set metrics.

    Parameters
    ----------
    dmfraud : not used by this function; it only occupies the first slot of
        the signature.
    trainX, trainY : training features and targets passed to ``fit``.
    testX, testY : held-out features and targets passed to
        ``validationmetrics_regression``.

    Returns
    -------
    None
        Results are printed by ``validationmetrics_regression``.
    """
    # Fixed hyper-parameters: depth-2 trees and a pinned seed (random_state=0).
    forest_params = {'max_depth': 2, 'random_state': 0}
    forest = RandomForestRegressor(**forest_params).fit(trainX, trainY)
    validationmetrics_regression(forest, testX, testY)

In [None]:
# Support Vector Regression (SVR)
def SupVecReg(dmfraud, trainX, trainY, testX, testY):
    """Fit an SVR model with default settings and print its test-set metrics.

    NOTE: the parameter order here is (trainX, trainY, testX, testY), which
    differs from LinReg / RandFor / PolReg (trainX, testX, trainY, testY).
    It is left unchanged so the signature stays as it was; pass arguments by
    keyword when in doubt.

    Parameters
    ----------
    dmfraud : not used by this function; it only occupies the first slot of
        the signature.
    trainX, trainY : training features and targets passed to ``fit``.
    testX, testY : held-out features and targets passed to
        ``validationmetrics_regression``.

    Returns
    -------
    None
        Results are printed by ``validationmetrics_regression``.
    """
    clf = SVR()
    # Previously the estimator was created and then discarded; train and
    # evaluate it the same way the other model helpers do.
    clf = clf.fit(trainX, trainY)
    validationmetrics_regression(clf, testX, testY)

In [110]:
# Polynomial Regression
# Plain linear regression fits a straight line. When the data bends up and
# down a curve is needed instead, so the inputs are expanded with polynomial
# terms (x, x^2, x^3, ...) and a LinearRegression model is fitted on the
# expanded features.
def PolReg(dmfraud, trainX, testX, trainY, testY, degree=5):
    """Fit a polynomial regression and print its test-set metrics.

    Parameters
    ----------
    dmfraud : not used by this function; it only occupies the first slot of
        the signature.
    trainX, trainY : training features and targets.
    testX, testY : held-out features and targets.
    degree : int, optional
        Degree handed to ``PolynomialFeatures``. Defaults to 5, the value
        that was previously hard-coded.

    Returns
    -------
    None
        Results are printed by ``validationmetrics_regression``.
    """
    poly = PolynomialFeatures(degree=degree)

    # Fit the feature expansion on the training data only ...
    train_poly = poly.fit_transform(trainX)
    # ... and reuse that fitted expansion for the test data. Calling
    # fit_transform here would refit the transformer on the test set.
    # A separate name is used so the testX argument is not overwritten.
    test_poly = poly.transform(testX)

    clf = LinearRegression()
    clf.fit(train_poly, trainY)
    validationmetrics_regression(clf, test_poly, testY)

In [111]:
def MachineLearningwithRFFS():
    """Load the data, split it, and run the three regression helpers.

    The dataset is loaded with ``load_data()``, split 80/20 with seed 91 by
    ``manual_split``, and then LinReg, RandFor and PolReg are each trained
    and evaluated on the same split. All results are printed.

    Despite the function name, no feature selection is applied. The earlier
    ``RFE(dmfraud, trainX, trainY, 0.2)`` line called scikit-learn's RFE
    class with the data in the estimator/option positions; the object was
    never fitted and the model helpers ignore their first argument, so it
    had no effect on the results. With keyword-only RFE options the call
    raises TypeError, so it has been removed.
    TODO(review): if recursive feature elimination is wanted, fit an RFE
    selector on trainX/trainY and transform both trainX and testX.

    Returns
    -------
    None
    """
    raw = load_data()
    features, trainX, testX, trainY, testY, y = manual_split(raw, 0.2, 91)

    print ('LINEAR REGRESSION ALGORITHM\n')
    LinReg (features, trainX, testX, trainY, testY)

    # Heading text kept as-is to match the recorded output; note that
    # RandFor fits a RandomForestRegressor, not a classifier.
    print ('\n\nRANDOM FOREST CLASSIFIER\n')
    RandFor (features, trainX, testX, trainY, testY)

    print ('\n\nPOLYNOMIAL REGRESSION\n')
    PolReg (features, trainX, testX, trainY, testY)

In [112]:
MachineLearningwithRFFS()

LINEAR REGRESSION ALGORITHM

Prediction Vector: 
 [0.07653662 0.05581566 0.07862634 ... 0.02470191 0.02023542 0.03330086]
RMSE:  0.04970669603285239
R-squared score:  0.34755497679497716
R-squared score :  0.34755497679497716


RANDOM FOREST CLASSIFIER

Prediction Vector: 
 [0.05805544 0.03935325 0.09730818 ... 0.01386627 0.01386627 0.0294916 ]
RMSE:  0.044648234866395596
R-squared score:  0.4735917135347907
R-squared score :  0.4735917135347907


POLYNOMIAL REGRESSION

Prediction Vector: 
 [0.05461371 0.05592902 0.10811369 ... 0.01248154 0.01094884 0.02221973]
RMSE:  0.04018131771234626
R-squared score:  0.5736537190878842
R-squared score :  0.5736537190878842


