In [1]:
from scipy.io import arff
import numpy as np
import pandas as pd

In [2]:
#load arff file
data=arff.loadarff('albrecht.arff')
df = pd.DataFrame(data[0])
df

Unnamed: 0,Input,Output,Inquiry,File,FPAdj,RawFPcounts,AdjFP,Effort
0,25.0,150.0,75.0,60.0,1.0,1750.0,1750.0,102.4
1,193.0,98.0,70.0,36.0,1.0,1902.0,1902.0,105.2
2,70.0,27.0,0.0,12.0,0.8,535.0,428.0,11.1
3,40.0,60.0,20.0,12.0,1.15,660.0,759.0,21.1
4,10.0,69.0,1.0,9.0,0.9,478.89,431.0,28.8
5,13.0,19.0,0.0,23.0,0.75,377.33,283.0,10.0
6,34.0,14.0,0.0,5.0,0.8,256.25,205.0,8.0
7,17.0,17.0,15.0,5.0,1.1,262.73,289.0,4.9
8,45.0,64.0,14.0,16.0,0.95,715.79,680.0,12.9
9,40.0,60.0,20.0,15.0,1.15,690.43,794.0,19.0


In [3]:
x = df.drop('Effort',axis=1)
y=df['Effort']

In [4]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((19, 7), (5, 7), (19,), (5,))

In [5]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)

# LinearRegression

In [6]:
from sklearn.linear_model import LinearRegression
regressor=LinearRegression()
regressor.fit(x_train,y_train)

LinearRegression()

In [7]:
print('intercept:',regressor.intercept_)
print('slope:',regressor.coef_)

intercept: 18.442105263157814
slope: [-429.25221365 -377.14821747 -165.86820226 -292.69199407  -11.44748357
 1043.048543     50.48713753]


In [8]:
y_preds=regressor.predict(x_test)
y_preds,y_test

(array([24.0234246 , 17.59389012, 86.09936026, 29.99421347, 15.57552499]),
 8      12.9
 16     18.3
 0     102.4
 18     38.1
 11      2.9
 Name: Effort, dtype: float64)

In [9]:
from sklearn import metrics
from sklearn.metrics import r2_score
print("pred",(np.abs(y_test - y_preds) <= 2 * metrics.mean_absolute_error(y_test,y_preds)).mean())
print("r2:",r2_score(y_test, y_preds, multioutput='variance_weighted'))
print("mae:",metrics.mean_absolute_error(y_test,y_preds))
print("mse:",metrics.mean_squared_error(y_test,y_preds))
print("rmse:",np.sqrt(metrics.mean_squared_error(y_test,y_preds)))

pred 1.0
r2: 0.9029436106870561
mae: 9.782297146570889
mse: 123.26254616877604
rmse: 11.102366692231708


In [10]:
# Calculate the absolute difference between predicted and actual values
abs_diff = abs(y_test - y_preds)

# Count the number of predictions within the threshold of 0.25
within_threshold = abs_diff[abs_diff < 0.25].count()

# Calculate the PRED metric
PRED = (within_threshold / len(y_preds)) * 100

print(f"PRED: {PRED}%")


PRED: 0.0%


# RandomForestRegression

In [11]:
from sklearn.ensemble import RandomForestRegressor
RFregressor=RandomForestRegressor()
RFregressor.fit(x_train,y_train)
RFregressor.score(x_test,y_test)

0.849622300492846

In [12]:
y_predrf = RFregressor.predict(x_test)
y_predrf,y_test

(array([16.61 , 16.911, 75.685, 52.292,  7.81 ]),
 8      12.9
 16     18.3
 0     102.4
 18     38.1
 11      2.9
 Name: Effort, dtype: float64)

In [48]:
from sklearn import metrics
print('pred:',(np.abs(y_test - y_predrf) <= 2 * metrics.mean_absolute_error(y_test,y_predrf)).mean())
print('r^2:',r2_score(y_test, y_predrf, multioutput='variance_weighted'))
print("maerf:",metrics.mean_absolute_error(y_test,y_predrf))
print("mserf:",metrics.mean_squared_error(y_test,y_predrf))
print("rmserf:",np.sqrt(metrics.mean_squared_error(y_test,y_predrf)))

pred: 0.6666666666666666
r^2: 0.34251249070599926
maerf: 59.9161333333334
mserf: 5448.407587600012
rmserf: 73.81332933556115


In [40]:
# Calculate the absolute difference between predicted and actual values
abs_diff = abs(y_test - y_predrf)

# Count the number of predictions within the threshold of 0.25
within_threshold = abs_diff[abs_diff < 0.25].count()

# Calculate the PRED metric
PRED = (within_threshold / len(y_predrf)) * 100

print(f"PRED: {PRED}%")


PRED: 0.0%


# Ridge

In [14]:
from sklearn.linear_model import Ridge
ridge=Ridge()
ridge.fit(x_train,y_train)
ridge.score(x_test,y_test)

0.9251370155775103

In [15]:
y_predr=ridge.predict(x_test)
y_predr,y_test

(array([27.17344333, 19.1867688 , 87.27324154, 33.83844928,  7.78736791]),
 8      12.9
 16     18.3
 0     102.4
 18     38.1
 11      2.9
 Name: Effort, dtype: float64)

In [49]:
from sklearn import metrics
from sklearn.metrics import r2_score
print("pred:",(np.abs(y_test - y_predr) <= 2 * metrics.mean_absolute_error(y_test,y_predr)).mean())
print("r2",r2_score(y_test, y_predr, multioutput='variance_weighted'))
print("maer:",metrics.mean_absolute_error(y_test,y_predr))
print("mser:",metrics.mean_squared_error(y_test,y_predr))
print("rmser:",np.sqrt(metrics.mean_squared_error(y_test,y_predr)))

pred: 1.0
r2 -2.8350533572646244
maer: 159.23852715881148
mser: 31779.97074500794
rmser: 178.2693769131646


In [41]:
# Calculate the absolute difference between predicted and actual values
abs_diff = abs(y_test - y_predr)

# Count the number of predictions within the threshold of 0.25
within_threshold = abs_diff[abs_diff < 0.25].count()

# Calculate the PRED metric
PRED = (within_threshold / len(y_predr)) * 100

print(f"PRED: {PRED}%")


PRED: 0.0%


# SVM

In [17]:
from sklearn.svm import LinearSVR
svr=LinearSVR(max_iter=1000)
svr.fit(x_train,y_train)
svr.score(x_test,y_test)

0.28946796618066817

In [18]:
y_predsvr=svr.predict(x_test)
y_predsvr,y_test

(array([12.69176881, 11.26437357, 38.35786581, 19.11043246,  2.29131797]),
 8      12.9
 16     18.3
 0     102.4
 18     38.1
 11      2.9
 Name: Effort, dtype: float64)

In [50]:
from sklearn import metrics
from sklearn.metrics import r2_score
print("pred:",(np.abs(y_test - y_predsvr) <= 2 * metrics.mean_absolute_error(y_test,y_predsvr)).mean())
print("r2",r2_score(y_test, y_predsvr, multioutput='variance_weighted'))
print("maerf:",metrics.mean_absolute_error(y_test,y_predsvr))
print("mserf:",metrics.mean_squared_error(y_test,y_predsvr))
print("rmserf:",np.sqrt(metrics.mean_squared_error(y_test,y_predsvr)))

pred: 1.0
r2 -4.110782085047033
maerf: 184.5666666666667
mserf: 42351.56333333333
rmserf: 205.79495458667915


In [42]:
# Calculate the absolute difference between predicted and actual values
abs_diff = abs(y_test - y_predsvr)

# Count the number of predictions within the threshold of 0.25
within_threshold = abs_diff[abs_diff < 0.25].count()

# Calculate the PRED metric
PRED = (within_threshold / len(y_predsvr)) * 100

print(f"PRED: {PRED}%")


PRED: 0.0%


# 2nd Dataset

In [20]:
#load arff file
data=arff.loadarff('kemerer.arff')
dff = pd.DataFrame(data[0])
dff

Unnamed: 0,ID,Language,Hardware,Duration,KSLOC,AdjFP,RAWFP,EffortMM
0,1.0,1.0,1.0,17.0,253.6,1217.1,1010.0,287.0
1,2.0,1.0,2.0,7.0,40.5,507.3,457.0,82.5
2,3.0,1.0,3.0,15.0,450.0,2306.8,2284.0,1107.31
3,4.0,1.0,1.0,18.0,214.4,788.5,881.0,86.9
4,5.0,1.0,2.0,13.0,449.9,1337.6,1583.0,336.3
5,6.0,1.0,4.0,5.0,50.0,421.3,411.0,84.0
6,7.0,2.0,4.0,5.0,43.0,99.9,97.0,23.2
7,8.0,1.0,2.0,11.0,200.0,993.0,998.0,130.3
8,9.0,1.0,1.0,14.0,289.0,1592.9,1554.0,116.0
9,10.0,1.0,1.0,5.0,39.0,240.0,250.0,72.0


In [21]:
dff.drop('ID', axis=1, inplace=True)
dff

Unnamed: 0,Language,Hardware,Duration,KSLOC,AdjFP,RAWFP,EffortMM
0,1.0,1.0,17.0,253.6,1217.1,1010.0,287.0
1,1.0,2.0,7.0,40.5,507.3,457.0,82.5
2,1.0,3.0,15.0,450.0,2306.8,2284.0,1107.31
3,1.0,1.0,18.0,214.4,788.5,881.0,86.9
4,1.0,2.0,13.0,449.9,1337.6,1583.0,336.3
5,1.0,4.0,5.0,50.0,421.3,411.0,84.0
6,2.0,4.0,5.0,43.0,99.9,97.0,23.2
7,1.0,2.0,11.0,200.0,993.0,998.0,130.3
8,1.0,1.0,14.0,289.0,1592.9,1554.0,116.0
9,1.0,1.0,5.0,39.0,240.0,250.0,72.0


In [22]:
dff.shape

(15, 7)

In [23]:
x=dff.drop('EffortMM',axis=1)
y=dff['EffortMM']

In [24]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((12, 6), (3, 6), (12,), (3,))

In [25]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)

# LinearRegression

In [26]:
from sklearn.linear_model import LinearRegression
regressor=LinearRegression()
regressor.fit(x_train,y_train)

LinearRegression()

In [27]:
print('intercept:',regressor.intercept_)
print('slope:',regressor.coef_)

intercept: 224.91750000000002
slope: [  4.67064048 132.86518059 -52.27710863 -37.35539941  45.67017066
 284.34393227]


In [28]:
y_predsm=regressor.predict(x_test)
y_predsm,y_test

(array([-231.18733778,  125.49158483,   41.72700896]),
 9      72.0
 11    230.7
 0     287.0
 Name: EffortMM, dtype: float64)

In [29]:
from sklearn import metrics
from sklearn.metrics import r2_score
print("pred:",(np.abs(y_test - y_predsm) <= 2 * metrics.mean_absolute_error(y_test,y_predsm)).mean())
print("r2",r2_score(y_test, y_predsm, multioutput='variance_weighted'))
print("mae:",metrics.mean_absolute_error(y_test,y_predsm))
print("mse:",metrics.mean_squared_error(y_test,y_predsm))
print("rmse:",np.sqrt(metrics.mean_squared_error(y_test,y_predsm)))

pred: 1.0
r2 -5.56272651918603
mae: 217.88958132888158
mse: 54383.404181885715
rmse: 233.20249608845467


In [44]:
# Calculate the absolute difference between predicted and actual values
abs_diff = abs(y_test - y_predsm)

# Count the number of predictions within the threshold of 0.25
within_threshold = abs_diff[abs_diff < 0.25].count()

# Calculate the PRED metric
PRED = (within_threshold / len(y_predsm)) * 100

print(f"PRED: {PRED}%")


PRED: 0.0%


# RandomForestRegression

In [30]:
from sklearn.ensemble import RandomForestRegressor
RFregressor=RandomForestRegressor()
RFregressor.fit(x_train,y_train)
RFregressor.score(x_test,y_test)

0.3425124907059992

In [31]:
y_predrf=RFregressor.predict(x_test)
y_predrf,y_test

(array([ 51.079 , 191.8724, 167.0002]),
 9      72.0
 11    230.7
 0     287.0
 Name: EffortMM, dtype: float64)

In [32]:
from sklearn import metrics
from sklearn.metrics import r2_score
print("pred:",(np.abs(y_test - y_predrf) <= 2 * (metrics.mean_absolute_error(y_test,y_predrf))).mean())
print("r2",r2_score(y_test, y_predrf, multioutput='variance_weighted'))
print("maerf:",metrics.mean_absolute_error(y_test,y_predrf))
print("mserf:",metrics.mean_squared_error(y_test,y_predrf))
print("rmserf:",np.sqrt(metrics.mean_squared_error(y_test,y_predrf)))

pred: 0.6666666666666666
r2 0.34251249070599926
maerf: 59.9161333333334
mserf: 5448.407587600012
rmserf: 73.81332933556115


In [45]:
# Calculate the absolute difference between predicted and actual values
abs_diff = abs(y_test - y_predrf)

# Count the number of predictions within the threshold of 0.25
within_threshold = abs_diff[abs_diff < 0.25].count()

# Calculate the PRED metric
PRED = (within_threshold / len(y_predrf)) * 100

print(f"PRED: {PRED}%")


PRED: 0.0%


# Ridge

In [33]:
from sklearn.linear_model import Ridge
ridge=Ridge()
ridge.fit(x_train,y_train)
ridge.score(x_test,y_test)

-2.8350533572646244

In [34]:
y_predr=ridge.predict(x_test)
y_predr,y_test

(array([-193.04333607,  159.55762856,  145.47012604]),
 9      72.0
 11    230.7
 0     287.0
 Name: EffortMM, dtype: float64)

In [35]:
from sklearn import metrics
from sklearn.metrics import r2_score
print("pred:",(np.abs(y_test - y_predr) <= 2 * (metrics.mean_absolute_error(y_test,y_predr))).mean())
print("r2",r2_score(y_test, y_predr, multioutput='variance_weighted'))
print("maer:",metrics.mean_absolute_error(y_test,y_predr))
print("mser:",metrics.mean_squared_error(y_test,y_predr))
print("rmser:",np.sqrt(metrics.mean_squared_error(y_test,y_predr)))

pred: 1.0
r2 -2.8350533572646244
maer: 159.23852715881148
mser: 31779.97074500794
rmser: 178.2693769131646


In [46]:
# Calculate the absolute difference between predicted and actual values
abs_diff = abs(y_test - y_predr)

# Count the number of predictions within the threshold of 0.25
within_threshold = abs_diff[abs_diff < 0.25].count()

# Calculate the PRED metric
PRED = (within_threshold / len(y_predr)) * 100

print(f"PRED: {PRED}%")


PRED: 0.0%


# SVR

In [36]:
from sklearn.svm import LinearSVR
svr=LinearSVR(max_iter=1000)
svr.fit(x_train,y_train)
svr.score(x_test,y_test)

-4.110782085047033

In [37]:
y_predsvr=svr.predict(x_test)
y_predsvr,y_test

(array([12., 12., 12.]),
 9      72.0
 11    230.7
 0     287.0
 Name: EffortMM, dtype: float64)

In [38]:
from sklearn import metrics
from sklearn.metrics import r2_score
print("pred:",(np.abs(y_test - y_predsvr) <= 2 * (metrics.mean_absolute_error(y_test,y_predsvr))).mean())
print("r2",r2_score(y_test, y_predsvr, multioutput='variance_weighted'))
print("maerf:",metrics.mean_absolute_error(y_test,y_predsvr))
print("mserf:",metrics.mean_squared_error(y_test,y_predsvr))
print("rmserf:",np.sqrt(metrics.mean_squared_error(y_test,y_predsvr)))

pred: 1.0
r2 -4.110782085047033
maerf: 184.5666666666667
mserf: 42351.56333333333
rmserf: 205.79495458667915


In [47]:
# Calculate the absolute difference between predicted and actual values
abs_diff = abs(y_test - y_predsvr)

# Count the number of predictions within the threshold of 0.25
within_threshold = abs_diff[abs_diff < 0.25].count()

# Calculate the PRED metric
PRED = (within_threshold / len(y_predsvr)) * 100

print(f"PRED: {PRED}%")


PRED: 0.0%
