In [5]:
import numpy as np
import pandas as pd

columns = ['Project', 'TeamExp', 'ManagerExp', 'YearEnd', 'Length', 'Effort', 'Transactions', 'Entities', 'PointsAdjust', 'Envergure', 'PointsNonAjust', 'Language']
df=pd.read_csv('./desharnais.txt', names=columns, comment='%', skipinitialspace=True, delimiter=',')
df.isna().sum()

print(df['TeamExp'].unique())
print(df['ManagerExp'].unique())

df['TeamExp'] = df['TeamExp'].replace('?', np.nan)
df['TeamExp'] = pd.to_numeric(df['TeamExp'], errors='coerce')

# Step 3: Replace NaN with the median (or mean if you prefer)
df['TeamExp'].fillna(df['TeamExp'].mean(), inplace=True)
df['ManagerExp'] = df['ManagerExp'].replace('?', np.nan)
df['ManagerExp'] = pd.to_numeric(df['ManagerExp'], errors='coerce')

# Step 3: Replace NaN with the median (or mean if you prefer)
df['ManagerExp'].fillna(df['ManagerExp'].mean(), inplace=True)

['1' '0' '4' '2' '3' '?']
['4' '0' '1' '2' '3' '?' '7' '5']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TeamExp'].fillna(df['TeamExp'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['ManagerExp'].fillna(df['ManagerExp'].mean(), inplace=True)


In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Train-test split
X = df[['TeamExp', 'ManagerExp', 'Length', 'Transactions', 'Entities', 'PointsAdjust', 'Envergure', 'Language']]
y = df['Effort']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize individual models
models = {
    'Linear Regression': LinearRegression(),
    'Support Vector Regression': SVR(kernel='linear'),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'KNN': KNeighborsRegressor()
}

# Adding ensemble model - Voting Regressor (which averages the predictions of the models)
ensemble_model = VotingRegressor(estimators=[
    ('lr', LinearRegression()),
    ('svr', SVR(kernel='linear')),
    ('dt', DecisionTreeRegressor()),
    ('rf', RandomForestRegressor()),
    ('gb', GradientBoostingRegressor()),
    ('knn', KNeighborsRegressor())
])

# Include the Voting Regressor in models dictionary
models['Ensemble (Voting Regressor)'] = ensemble_model

# Train models and evaluate performance
results = {}

for model_name, model in models.items():
    # Perform 10-fold cross-validation
    cv_scores = cross_val_score(model, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
    mean_cv_score = -np.mean(cv_scores)
    
    # Fit the model
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Calculate performance metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    correlation_coefficient = np.corrcoef(y_test, y_pred)[0, 1]
    mae = mean_absolute_error(y_test, y_pred)
    rae = np.sum(np.abs(y_test - y_pred)) / np.sum(np.abs(y_test - np.mean(y_test)))
    rrse = np.sqrt(np.sum((y_test - y_pred) ** 2) / np.sum((y_test - np.mean(y_test)) ** 2))
    
    # Store results
    results[model_name] = {
        'Mean CV MSE': mean_cv_score,
        'RMSE': rmse,
        'R^2': r2,
        'Correlation Coefficient': correlation_coefficient,
        'MAE': mae,
        'RAE': rae * 100,  # Convert to percentage
        'RRSE': rrse * 100  # Convert to percentage
    }

# Print results
for model_name, metrics in results.items():
    print(f"{model_name}:")
    print(f"  Mean CV MSE: {metrics['Mean CV MSE']:.2f}")
    print(f"  RMSE: {metrics['RMSE']:.2f}")
    print(f"  R^2: {metrics['R^2']:.2f}")
    print(f"  Correlation Coefficient: {metrics['Correlation Coefficient']:.4f}")
    print(f"  MAE: {metrics['MAE']:.2f}")
    print(f"  Relative Absolute Error (RAE): {metrics['RAE']:.2f} %")
    print(f"  Root Relative Squared Error (RRSE): {metrics['RRSE']:.2f} %\n")


Linear Regression:
  Mean CV MSE: 11012174.35
  RMSE: 1920.31
  R^2: 0.71
  Correlation Coefficient: 0.8595
  MAE: 1596.61
  Relative Absolute Error (RAE): 59.59 %
  Root Relative Squared Error (RRSE): 53.76 %

Support Vector Regression:
  Mean CV MSE: 13918998.86
  RMSE: 2541.80
  R^2: 0.49
  Correlation Coefficient: 0.7800
  MAE: 1897.43
  Relative Absolute Error (RAE): 70.82 %
  Root Relative Squared Error (RRSE): 71.16 %

Decision Tree:
  Mean CV MSE: 26648233.00
  RMSE: 4354.70
  R^2: -0.49
  Correlation Coefficient: 0.3500
  MAE: 3128.82
  Relative Absolute Error (RAE): 116.78 %
  Root Relative Squared Error (RRSE): 121.91 %

Random Forest:
  Mean CV MSE: 14265878.73
  RMSE: 2429.94
  R^2: 0.54
  Correlation Coefficient: 0.7751
  MAE: 1857.85
  Relative Absolute Error (RAE): 69.35 %
  Root Relative Squared Error (RRSE): 68.03 %

Gradient Boosting:
  Mean CV MSE: 13593381.43
  RMSE: 1943.13
  R^2: 0.70
  Correlation Coefficient: 0.8501
  MAE: 1673.91
  Relative Absolute Error (RAE