In [21]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 

In [None]:
# Load dataset
columns = ['Project', 'TeamExp', 'ManagerExp', 'YearEnd', 'Length', 'Effort', 'Transactions', 'Entities', 'PointsAdjust', 'Envergure', 'PointsNonAjust', 'Language']
df=pd.read_csv('./desharnais.txt', names=columns, comment='%', na_values='?', skipinitialspace=True, delimiter=',')

df.head()

In [None]:
# Missing values and data types
df.info()
df.isnull().sum()

In [None]:
#Basic stats for dataset
#mean
# percentile for p=0.25 & 0.75
# third quartile/median
df.describe()

In [None]:
# Duplicate rows
df.duplicated().sum()

In [None]:
# Adequacy of data using heatmap
sns.heatmap(df.corr())

In [None]:
# Checking for class imbalance
for col in columns[1:]:
    df[col].plot(kind='hist', bins=20, title=f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.show()


In [None]:
# Plotting boxplots for each numerical column
for col in columns:
    plt.figure(figsize=(15,15))
    sns.boxplot(y=df[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)
    plt.show()

In [None]:
from scipy.stats import trim_mean

trim_frac=0.10
for col in columns[1:]:
    print(f'Trimmed Mean for {col} : {trim_mean(df[col], proportiontocut=trim_frac)}')

In [None]:
trim_frac = 0.1

trimmed_df = pd.DataFrame()

for col in df.columns:
    sorted_data = np.sort(df[col])
    
    lower_idx = int(len(sorted_data) * trim_frac)
    upper_idx = int(len(sorted_data) * (1 - trim_frac))
    
    trimmed_data = sorted_data[lower_idx:upper_idx]
    
    trimmed_column = np.full_like(df[col], np.nan)
    trimmed_column[lower_idx:upper_idx] = trimmed_data
    
    trimmed_df[col] = trimmed_column

print(trimmed_df)


In [32]:
trimmed_median_dict = {}
trimmed_std_dict = {}

for col in df.columns:
    sorted_data = np.sort(df[col])
    
    lower_idx = int(len(sorted_data) * trim_frac)
    upper_idx = int(len(sorted_data) * (1 - trim_frac))
    
    trimmed_data = sorted_data[lower_idx:upper_idx]
    
    trimmed_median = np.median(trimmed_data)
    trimmed_median_dict[col] = trimmed_median
    
    trimmed_std = np.std(trimmed_data, ddof=1) 
    trimmed_std_dict[col] = trimmed_std

trimmed_median_df = pd.DataFrame(list(trimmed_median_dict.items()), columns=['Column', 'Trimmed Median'])
trimmed_std_df = pd.DataFrame(list(trimmed_std_dict.items()), columns=['Column', 'Trimmed Std Deviation'])

print("Trimmed Medians:")
print(trimmed_median_df[1:])

print("\nTrimmed Standard Deviations:")
print(trimmed_std_df[1:])


Trimmed Medians:
            Column  Trimmed Median
1          TeamExp             2.0
2       ManagerExp             3.0
3          YearEnd            86.0
4           Length            10.0
5           Effort          3636.5
6     Transactions           138.0
7         Entities            98.0
8     PointsAdjust           259.5
9        Envergure            28.0
10  PointsNonAjust           250.0
11        Language             1.0

Trimmed Standard Deviations:
            Column  Trimmed Std Deviation
1          TeamExp               1.122351
2       ManagerExp               1.166173
3          YearEnd               0.753300
4           Length               3.887014
5           Effort            2155.469348
6     Transactions              67.900655
7         Entities              50.897342
8     PointsAdjust              99.165041
9        Envergure               7.256343
10  PointsNonAjust             105.451241
11        Language               0.528691


COCOMO81


In [34]:
import arff

with open('./cocomo811.arff') as f:
    dataset=arff.load(f)

df = pd.DataFrame(dataset['data'], columns=[attr[0] for attr in dataset['attributes']])

df.head()

Unnamed: 0,rely,data,cplx,time,stor,virt,turn,acap,aexp,pcap,vexp,lexp,modp,tool,sced,loc,actual
0,0.88,1.16,0.7,1.0,1.06,1.15,1.07,1.19,1.13,1.17,1.1,1.0,1.24,1.1,1.04,113.0,2040.0
1,0.88,1.16,0.85,1.0,1.06,1.0,1.07,1.0,0.91,1.0,0.9,0.95,1.1,1.0,1.0,293.0,1600.0
2,1.0,1.16,0.85,1.0,1.0,0.87,0.94,0.86,0.82,0.86,0.9,0.95,0.91,0.91,1.0,132.0,243.0
3,0.75,1.16,0.7,1.0,1.0,0.87,1.0,1.19,0.91,1.42,1.0,0.95,1.24,1.0,1.04,60.0,240.0
4,0.88,0.94,1.0,1.0,1.0,0.87,1.0,1.0,1.0,0.86,0.9,0.95,1.24,1.0,1.0,16.0,33.0
