In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score
from sklearn.ensemble import StackingRegressor
from sklearn.pipeline import make_pipeline

In [5]:
df = pd.read_csv("china.csv")
df

Unnamed: 0,AFP,Input,Output,Enquiry,File,Interface,Added,Changed,Deleted,Resource,Duration,DevType,AdjFactor,Effort
0,1587,774,260,340,128,0,1502,0,0,4,4.0,NewDev,1.056591,7490
1,260,9,4,3,193,41,51,138,61,2,17.0,Maint,1.040000,4150
2,152,25,33,28,42,35,163,0,0,1,9.0,NewDev,0.932515,668
3,252,151,28,8,39,0,69,153,4,1,4.0,Maint,1.115044,3238
4,292,93,0,194,20,0,0,307,0,1,13.0,Maint,0.951140,2994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,51,32,0,16,7,0,0,55,0,1,3.0,Maint,0.927273,899
495,1106,410,267,108,152,25,962,0,0,2,20.0,NewDev,1.149688,16560
496,99,13,32,3,42,5,95,0,0,1,4.0,NewDev,1.042105,526
497,56,24,0,9,14,0,47,0,0,2,6.0,NewDev,1.191489,440


In [6]:
df.drop(["Deleted", "Resource", "Changed", "Interface", "AdjFactor", "DevType"], axis=True, inplace=True)
df

Unnamed: 0,AFP,Input,Output,Enquiry,File,Added,Duration,Effort
0,1587,774,260,340,128,1502,4.0,7490
1,260,9,4,3,193,51,17.0,4150
2,152,25,33,28,42,163,9.0,668
3,252,151,28,8,39,69,4.0,3238
4,292,93,0,194,20,0,13.0,2994
...,...,...,...,...,...,...,...,...
494,51,32,0,16,7,0,3.0,899
495,1106,410,267,108,152,962,20.0,16560
496,99,13,32,3,42,95,4.0,526
497,56,24,0,9,14,47,6.0,440


In [7]:
x = df.iloc[:,:-1]
y = df.iloc[:,-1] 

In [8]:
x_rftrain, x_rftest, y_rftrain, y_rftest = train_test_split(x, y, test_size=0.20, random_state=20)
rfr = RandomForestRegressor(n_estimators=100, )
rfr.fit(x_rftrain, y_rftrain)
print("Training Score : ", rfr.score(x_rftrain, y_rftrain))
print("Test Score : ", rfr.score(x_rftest, y_rftest))

Training Score :  0.9069133658901503
Test Score :  0.8168676030576274


In [9]:
y_pred = rfr.predict(x_rftest)

In [10]:
error = {
    "Error_Type": ["Mean Absolute Error ","Root Mean Square Error ","Mean Square Log Error ", "R2_score"],
    "RandomForest error Value": [mean_absolute_error(y_rftest, y_pred),np.sqrt(mean_squared_error(y_rftest, y_pred)),mean_squared_log_error(y_rftest, y_pred), r2_score(y_rftest, y_pred)]
}
error_rfr = pd.DataFrame(error)
error_rfr

Unnamed: 0,Error_Type,RandomForest error Value
0,Mean Absolute Error,1922.4509
1,Root Mean Square Error,2747.278708
2,Mean Square Log Error,1.145927
3,R2_score,0.816868


In [11]:
df_new = df

In [12]:
for i in df_new.columns:
    upper_boundary = df_new[i].mean()  + 3*df_new[i].std()
    lower_boundary = df_new[i].mean()  - 3*df_new[i].std()
    print(f"For column : {i} upper boundary {upper_boundary}, lower boundary {lower_boundary}")
    IQR = df_new[i].quantile(0.75) - df[i].quantile(0.25)
    lower_bridge = df_new[i].quantile(0.25) - (IQR*1.5)
    upper_bridge = df_new[i].quantile(0.75) + (IQR*1.5)
    print(f" Range of columns : {i} is from {lower_bridge} to {upper_bridge} rest are outliers !")
    df_new.loc[df_new[i] >= upper_bridge] = upper_bridge

For column : AFP upper boundary 3664.372022692393, lower boundary -2690.656591830669
 Range of columns : AFP is from -405.0 to 943.0 rest are outliers !
For column : Input upper boundary 1076.3407674594325, lower boundary -704.0602063371881
 Range of columns : Input is from -161.25 to 340.75 rest are outliers !
For column : Output upper boundary 462.16331140469896, lower boundary -263.31261000189335
 Range of columns : Output is from -154.0 to 294.0 rest are outliers !
For column : Enquiry upper boundary 389.19312216414494, lower boundary -236.10694981945554
 Range of columns : Enquiry is from -118.5 to 213.5 rest are outliers !
For column : File upper boundary 305.73228094777176, lower boundary -162.67817273133892
 Range of columns : File is from -122.5 to 241.5 rest are outliers !
For column : Added upper boundary 559.713960440238, lower boundary -249.15283819574904
 Range of columns : Added is from -221.5 to 474.5 rest are outliers !
For column : Duration upper boundary 380.94874103

In [13]:
x_new = df_new.iloc[:,:-1]
y_new = df_new.iloc[:, -1]

In [14]:
lr_list_scaled=[]
for i in range(1, 42):
    print(f"-----------------------Random State {i}---------------------------")
    x_train, x_test, y_train, y_test = train_test_split(x_new, y_new, test_size=0.30, random_state=i)
    print("Training data of independent col : ", x_train.shape)
    print("Training data of dependent col : ", y_train.shape)
    print("Test data of independent col : ", x_test.shape)
    print("Test data of dependent col : ", y_test.shape)
    lr = LinearRegression()
    lr.fit(x_train, y_train)
    print("Training Score : ", lr.score(x_train, y_train))
    print("Test Score : ", lr.score(x_test, y_test))
    lr_list_scaled.append(lr.score(x_test, y_test))
    print("\n\n\n")

-----------------------Random State 1---------------------------
Training data of independent col :  (349, 7)
Training data of dependent col :  (349,)
Test data of independent col :  (150, 7)
Test data of dependent col :  (150,)
Training Score :  0.6389332274754874
Test Score :  0.5063569738779428




-----------------------Random State 2---------------------------
Training data of independent col :  (349, 7)
Training data of dependent col :  (349,)
Test data of independent col :  (150, 7)
Test data of dependent col :  (150,)
Training Score :  0.5908016914963404
Test Score :  0.6391388226693966




-----------------------Random State 3---------------------------
Training data of independent col :  (349, 7)
Training data of dependent col :  (349,)
Test data of independent col :  (150, 7)
Test data of dependent col :  (150,)
Training Score :  0.6022141822288731
Test Score :  0.6235021161272196




-----------------------Random State 4---------------------------
Training data of independe

In [15]:
max(lr_list_scaled)

0.7118258568546747

In [16]:
x_lrtrain, x_lrtest, y_lrtrain, y_lrtest = train_test_split(x_new, y_new, test_size=0.20, random_state=32)
lr = LinearRegression()
lr.fit(x_lrtrain, y_lrtrain)
print("Training Score : ", lr.score(x_lrtrain, y_lrtrain))
print("Test Score : ", lr.score(x_lrtest, y_lrtest))

Training Score :  0.5755408060914224
Test Score :  0.7136839127491039


In [17]:
model_acc = pd.DataFrame({"Model": ["RandomForestRegression", "LinearRegression"],
'Score': [rfr.score(x_rftest, y_rftest), lr.score(x_lrtest, y_lrtest)]

})
model_acc

Unnamed: 0,Model,Score
0,RandomForestRegression,0.816868
1,LinearRegression,0.713684


In [18]:
px.histogram(model_acc, x="Model", y="Score", color="Model")

In [19]:
y_lrpred = lr.predict(x_lrtest)

In [21]:
error_rfr["LinearRegression Error"] =[mean_absolute_error(y_lrtest, y_lrpred),np.sqrt(mean_squared_error(y_lrtest, y_lrpred)),mean_squared_log_error(y_lrtest, y_lrpred), r2_score(y_lrtest, y_lrpred)]

In [22]:
error_rfr

Unnamed: 0,Error_Type,RandomForest error Value,LinearRegression Error
0,Mean Absolute Error,1922.4509,688.127807
1,Root Mean Square Error,2747.278708,953.736275
2,Mean Square Log Error,1.145927,2.097591
3,R2_score,0.816868,0.713684


In [23]:
df_new

Unnamed: 0,AFP,Input,Output,Enquiry,File,Added,Duration,Effort
0,31.5,31.5,31.5,31.5,31.5,31.5,31.5,31.5
1,260.0,9.0,4.0,3.0,193.0,51.0,17.0,4150.0
2,152.0,25.0,33.0,28.0,42.0,163.0,9.0,668.0
3,252.0,151.0,28.0,8.0,39.0,69.0,4.0,3238.0
4,292.0,93.0,0.0,194.0,20.0,0.0,13.0,2994.0
...,...,...,...,...,...,...,...,...
494,51.0,32.0,0.0,16.0,7.0,0.0,3.0,899.0
495,31.5,31.5,31.5,31.5,31.5,31.5,31.5,31.5
496,99.0,13.0,32.0,3.0,42.0,95.0,4.0,526.0
497,56.0,24.0,0.0,9.0,14.0,47.0,6.0,440.0


In [26]:
x = df_new.iloc[:, :-1]
y = df_new.iloc[:,-1]

In [27]:
estimators = [
    ('rfr', make_pipeline(RandomForestRegressor(n_estimators=100))),
    ('lr', make_pipeline(LinearRegression()))
]
clf = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())

In [28]:
scores=[]
for i in range(0, 42):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=i)
    clf.fit(x_train, y_train)
    scores.append(["For value of ",i, " got accuracy ", clf.score(x_test, y_test)])
    print(clf.score(x_test, y_test))

0.6507992719453434
0.533648030332884
0.6562374558150209
0.5497537710266807
0.5672016226209826
0.6104677945185406
0.7331969889132917
0.6608282236698533
0.6015591308374262
0.701513023990562
0.700552753797211
0.6692031988938323
0.635530291716959
0.6192466912226569
0.6111059921281521
0.5880581249024225
0.7183666146422026
0.5744736340091551
0.7047701723218777
0.7043113678920153
0.6394335538848195
0.5298146657964466
0.5587823050527341
0.6578601472580539
0.614995905760844
0.6273477045504013
0.6759063198092694
0.689730853695836
0.7139462554404139
0.6360295516294702
0.5614296334214157
0.691928442748377
0.7404754113924046
0.6448673245047606
0.6093580831991006
0.7747268702551187
0.6353749276838664
0.6312226262966971
0.5483990244353665
0.5370794509353622
0.5826757242791947
0.6626179510674238


In [29]:
scores

[['For value of ', 0, ' got accuracy ', 0.6507992719453434],
 ['For value of ', 1, ' got accuracy ', 0.533648030332884],
 ['For value of ', 2, ' got accuracy ', 0.6562374558150209],
 ['For value of ', 3, ' got accuracy ', 0.5497537710266807],
 ['For value of ', 4, ' got accuracy ', 0.5672016226209826],
 ['For value of ', 5, ' got accuracy ', 0.6104677945185406],
 ['For value of ', 6, ' got accuracy ', 0.7331969889132917],
 ['For value of ', 7, ' got accuracy ', 0.6608282236698533],
 ['For value of ', 8, ' got accuracy ', 0.6015591308374262],
 ['For value of ', 9, ' got accuracy ', 0.701513023990562],
 ['For value of ', 10, ' got accuracy ', 0.700552753797211],
 ['For value of ', 11, ' got accuracy ', 0.6692031988938323],
 ['For value of ', 12, ' got accuracy ', 0.635530291716959],
 ['For value of ', 13, ' got accuracy ', 0.6192466912226569],
 ['For value of ', 14, ' got accuracy ', 0.6111059921281521],
 ['For value of ', 15, ' got accuracy ', 0.5880581249024225],
 ['For value of ', 16,

In [30]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=35)
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

0.7779092436910422

In [31]:
y_pred = clf.predict(x_test)

In [32]:
error_rfr["Stacking"] = [mean_absolute_error(y_test, y_pred),np.sqrt(mean_squared_error(y_test, y_pred)),mean_squared_log_error(y_test, y_pred), r2_score(y_test, y_pred)]

In [33]:
error_rfr

Unnamed: 0,Error_Type,RandomForest error Value,LinearRegression Error,Stacking
0,Mean Absolute Error,1922.4509,688.127807,517.426954
1,Root Mean Square Error,2747.278708,953.736275,754.231134
2,Mean Square Log Error,1.145927,2.097591,1.07024
3,R2_score,0.816868,0.713684,0.777909


In [34]:
clf.estimators_

[Pipeline(steps=[('randomforestregressor', RandomForestRegressor())]),
 Pipeline(steps=[('linearregression', LinearRegression())])]

In [36]:
clf.final_estimator_