In [8]:
import arff, numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import KFold,cross_val_score
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV



In [9]:
dataset = arff.load(open('cocomo811.arff'))
data=np.array(dataset['data'])

In [10]:
data.shape

(63, 17)

In [12]:
X=data[:,0:16]
Y=data[:,16:17]

In [16]:
# Convert to pandas DataFrames
X_df = pd.DataFrame(X, columns=[f'Feature_{i+1}' for i in range(X.shape[1])])
Y_df = pd.DataFrame(Y, columns=['actual'])

X_df.head(), Y_df.head()

(   Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  Feature_6  \
 0       0.88       1.16       0.70        1.0       1.06       1.15   
 1       0.88       1.16       0.85        1.0       1.06       1.00   
 2       1.00       1.16       0.85        1.0       1.00       0.87   
 3       0.75       1.16       0.70        1.0       1.00       0.87   
 4       0.88       0.94       1.00        1.0       1.00       0.87   
 
    Feature_7  Feature_8  Feature_9  Feature_10  Feature_11  Feature_12  \
 0       1.07       1.19       1.13        1.17         1.1        1.00   
 1       1.07       1.00       0.91        1.00         0.9        0.95   
 2       0.94       0.86       0.82        0.86         0.9        0.95   
 3       1.00       1.19       0.91        1.42         1.0        0.95   
 4       1.00       1.00       1.00        0.86         0.9        0.95   
 
    Feature_13  Feature_14  Feature_15  Feature_16  
 0        1.24        1.10        1.04       113.0  
 1      

In [13]:

print(Y.shape)
# Y=Y.reshape(63)
# print(Y.shape)

(63, 1)


In [14]:
X.shape

(63, 16)

In [15]:
# Function to detect outliers using IQR for the entire dataframe
def detect_outliers_iqr(df):
    outliers_dict = {}
    for column in df.select_dtypes(include=[np.number]).columns:  # Only numeric columns
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
        
        # Store outliers for each column
        if not outliers.empty:
            outliers_dict[column] = outliers
    return outliers_dict

# Detecting outliers for all numeric columns
outliers_dict = detect_outliers_iqr(X)

# Display outliers for each column
for column, outliers in outliers_dict.items():
    print(f"Outliers in '{column}':")
    print(outliers[['Project', column]])  # Display only the project and the outlier column
    print("\n")

AttributeError: 'numpy.ndarray' object has no attribute 'select_dtypes'

# Cocomo n_estimator=100 and max_features= 1 to 16

In [7]:

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=42)

In [8]:
lab_enc = preprocessing.LabelEncoder()
training_scores_encoded = lab_enc.fit_transform(y_train)

In [9]:
n_feature=7
MMRE_list=[]
MdMRE_list=[]
Pred_list= []
for i in np.arange(1,11):   
    clf= RandomForestRegressor(n_estimators=100,max_features=i)
    clf.fit(X_train, training_scores_encoded)
    Y_pred=clf.predict(X_test)
    diff=np.absolute(y_test-Y_pred)
    MRE=diff/y_test
    MMRE=np.mean(MRE)
    MdMRE=np.median(MRE)
    P=MRE[MRE<.25]
    Pred=(P.size/MRE.size) * 100
    MMRE_list.append(MMRE)
    MdMRE_list.append(MdMRE)
    Pred_list.append(Pred)

In [None]:
MMRE_list

In [None]:
MdMRE_list

In [None]:
Pred_list

In [None]:
import matplotlib.pyplot as plt
import numpy as np

x = np.arange(1,11)

plt.plot(x, MMRE_list)
plt.plot(x, MdMRE_list)

plt.legend(['MMRE','MdMRE'], loc='upper right')

plt.show()

In [None]:
plt.plot(x, Pred_list)
plt.legend(['Pred'], loc='upper right')
plt.show()

# COCOMO n_estimator=100 to 2000 and max_features= 7


In [15]:
MMRE_list=[]
MdMRE_list=[]
Pred_list= []
for i in np.arange(100,1100,100):   
    clf= RandomForestRegressor(n_estimators=i,max_features=7)
    clf.fit(X_train, training_scores_encoded)
    Y_pred=clf.predict(X_test)
    diff=np.absolute(y_test-Y_pred)
    MRE=diff/y_test
    MMRE=np.mean(MRE)
    MdMRE=np.median(MRE)
    P=MRE[MRE<.25]
    Pred=(P.size/MRE.size) * 100
    MMRE_list.append(MMRE)
    MdMRE_list.append(MdMRE)
    Pred_list.append(Pred)

In [None]:
MMRE_list

In [None]:
MdMRE_list

In [None]:
Pred_list

In [None]:
x = np.arange(100,1100,100)
plt.plot(x, MMRE_list)
plt.plot(x, MdMRE_list)

plt.legend(['MMRE','MdMRE'], loc='upper right')

plt.show()

In [None]:
plt.plot(x, Pred_list)
plt.legend(['Pred'], loc='upper right')
plt.show()

# GridSearchCV

In [21]:
params_grd={
 'max_features': [1,2,3,4,5,6,7,8,9,10],
  'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [None]:
clf= RandomForestRegressor()
grd_search=GridSearchCV(estimator = clf, param_grid = params_grd, 
                          cv = 10, n_jobs = -1, verbose = 2)
grd_search.fit(X_train, training_scores_encoded)
best_grid = grd_search.best_estimator_

Y_pred=best_grid.predict(X_test)
diff=np.absolute(y_test-Y_pred)
MRE=diff/y_test
MMRE=np.mean(MRE)
MdMRE=np.median(MRE)
P=MRE[MRE<.25]
Pred=(P.size/MRE.size) * 100


In [None]:
print(MMRE)
print(MdMRE)
print(Pred)

In [None]:
grd_search.best_params_


# Random Hyperparameter Grid

In [25]:
Random_grid={'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': [1,2,3,4,5,6,7,8,9,10],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [None]:
clf= RandomForestRegressor()
random_grd_search=RandomizedSearchCV(estimator = clf, param_distributions = Random_grid, n_iter = 80, cv = 10, 
                              verbose=2, random_state=42, n_jobs = -1)
random_grd_search.fit(X_train, training_scores_encoded)
best_grid = random_grd_search.best_estimator_

Y_pred=best_grid.predict(X_test)
diff=np.absolute(y_test-Y_pred)
MRE=diff/y_test
MMRE=np.mean(MRE)
MdMRE=np.median(MRE)
P=MRE[MRE<.25]
Pred=(P.size/MRE.size) * 100


In [None]:
print(MMRE)
print(MdMRE)
print(Pred)

In [None]:
random_grd_search.best_params_

#  Regression Tree Analysis

In [29]:
clf= DecisionTreeRegressor(max_depth=30, min_samples_split=20)
clf.fit(X_train, training_scores_encoded)
Y_pred=clf.predict(X_test)
diff=np.absolute(y_test-Y_pred)
MRE=diff/y_test
MMRE=np.mean(MRE)
MdMRE=np.median(MRE)
P=MRE[MRE<.25]
Pred=(P.size/MRE.size) * 100

In [None]:
print(MMRE)
print(MdMRE)
print(Pred)