In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset into a DataFrame. The path is relative, so 'data.csv' must
# be in the directory the notebook kernel is running from.
df = pd.read_csv('data.csv')

In [3]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,Technology,Experience,Difficulty,Module,ETAHours
0,Python,1,1,UserModule,3
1,Python,2,1,UserModule,4
2,Python,3,1,UserModule,3
3,Python,4,1,UserModule,1
4,Python,5,1,UserModule,2


In [4]:
# Feature matrix: the first four columns of df, picked by position
# (everything to the left of the target column).
X = df.iloc[:, :4]

# Show the first rows so the selected feature columns can be verified.
X.head()

Unnamed: 0,Technology,Experience,Difficulty,Module
0,Python,1,1,UserModule
1,Python,2,1,UserModule
2,Python,3,1,UserModule
3,Python,4,1,UserModule
4,Python,5,1,UserModule


In [5]:
# Target vector: the column at position 4 of df, selected by position
# (the preview of y reports its name as ETAHours).
y = df.iloc[:,4]

In [6]:
# Quick look at the first target values and the dtype of the series.
y.head()

0    3
1    4
2    3
3    1
4    2
Name: ETAHours, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 30% of the rows for evaluation. The fixed random_state makes the
# shuffle, and therefore the split, identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Numeric features (positions 1 and 2 of X) are standardised.
numerical_cols = [1, 2]
numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical features (positions 0 and 3 of X) are one-hot encoded.
# handle_unknown='ignore' means a category that was not present at fit time
# does not raise an error when it is encountered during transform.
categorical_cols = [0, 3]
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Bundle both preprocessing branches: each transformer is applied only to
# its own columns, which are addressed by position.
preprocessor = ColumnTransformer(
    transformers=[
        ('scl', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel


# Estimator used as the final step of the pipeline. random_state is fixed so
# that repeated runs build the same forest.
model = RandomForestRegressor(random_state=0)
# Alternative regressors that can be swapped in instead: uncomment one of the
# assignments below and it overrides the forest above because it runs later.
# The Gaussian-process option needs both of its lines (kernel and model).
# model = LinearRegression()
# model = KNeighborsRegressor(n_neighbors=3)
# model = DecisionTreeRegressor()
# model = SVR(epsilon=0.2)
# kernel = DotProduct() + WhiteKernel()
# model = GaussianProcessRegressor(kernel=kernel)

In [11]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,explained_variance_score

# Chain preprocessing and the estimator into one object, so exactly the same
# transformations are applied when fitting and when predicting.
my_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

# Fit on the training split only, then predict the held-out split.
# Pipeline.fit returns the fitted pipeline itself, so the calls can be chained.
preds = my_pipeline.fit(X_train, y_train).predict(X_test)

# Compute every evaluation metric on the held-out split first ...
mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)
explained_variance = round(explained_variance_score(y_test, preds), 2)

# ... then report them together.
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)
print("Explain variance score =", explained_variance)

MAE: 1.0693866666666667
MSE: 1.9411829333333335
RMSE: 1.3932634113236928
Explain variance score = 0.9


In [12]:
# Actual target values of the held-out split, for comparison with the predictions.
y_test

680     9
1102    7
394     9
930     8
497     8
       ..
66      6
67      4
376     6
211     9
12      3
Name: ETAHours, Length: 375, dtype: int64

In [13]:
# Values the fitted pipeline predicted for X_test.
preds

array([ 8.63,  6.51,  8.3 ,  8.57,  9.95, 10.85, 11.36, 15.45,  5.51,
        2.62,  9.04, 17.97,  1.54, 11.07,  2.69,  3.43, 10.46,  9.45,
        5.23,  2.43,  3.45,  8.46,  2.82,  1.83,  5.2 ,  2.89, 16.89,
        2.61, 10.7 ,  5.13,  4.92,  5.33,  6.07,  8.81,  8.56,  5.46,
        5.58,  3.45,  5.29,  3.72,  2.13,  4.35,  2.71,  5.6 ,  2.25,
        5.59,  8.68,  4.94, 10.46,  2.74,  3.56, 17.94,  5.52,  3.8 ,
       16.6 ,  5.81,  9.6 ,  2.75, 17.03,  2.62,  6.42,  3.34,  4.14,
        2.06, 11.57,  2.76,  5.08,  5.13,  2.98,  9.28,  5.26,  1.41,
        4.82,  2.39,  3.48,  8.24,  3.46, 10.44,  5.57,  4.53,  1.51,
        2.7 ,  5.5 ,  3.46,  4.98,  5.45,  5.63,  8.8 , 12.01,  4.05,
        6.21, 10.89,  9.  ,  4.95, 11.82,  6.47,  6.28, 17.96, 10.29,
        3.15, 15.33,  5.61,  5.25,  2.47,  3.04,  3.48,  3.41,  3.64,
        8.71, 15.2 ,  5.29,  2.95,  2.86, 17.33, 15.79, 16.93,  8.26,
       10.68, 16.73, 16.53,  3.87,  2.55, 10.63,  2.22,  2.7 ,  3.53,
        4.67,  5.1 ,

In [14]:
# Predict for two hand-written samples. The pipeline was fitted on a
# DataFrame, so the samples are wrapped in a DataFrame with the same columns
# instead of being passed as a bare list of lists: that keeps the feature
# names attached (no scikit-learn feature-name warnings) and makes the
# expected column order explicit.
# NOTE(review): because the encoder uses handle_unknown='ignore', a label that
# was not present in the training data is silently ignored rather than
# rejected - confirm that 'C#', 'php' and 'RoleModule' occur in data.csv.
new_samples = pd.DataFrame(
    [['C#', 6, 3, 'UserModule'], ['php', 9, 5, 'RoleModule']],
    columns=X_train.columns,
)
my_preds = my_pipeline.predict(new_samples)



In [15]:
# Show the predictions obtained for the hand-written samples.
my_preds

array([3.53, 5.58])

In [16]:
import pickle

# Pipeline.score delegates to the regressor's score method, i.e. R^2.
# It is computed on the training split here, so it is an optimistic figure
# compared with the metrics measured on the held-out split.
print('Model score:', my_pipeline.score(X_train,y_train))

# Persist the whole fitted pipeline (preprocessing + model). The context
# manager closes the file handle, which guarantees the pickle is flushed to
# disk before anything tries to read it back.
with open('rfr_model.sav', 'wb') as model_file:
    pickle.dump(my_pipeline, model_file)

Model score: 0.9860374134161146


In [17]:
# Reload the persisted pipeline. The context manager closes the file handle
# as soon as unpickling has finished.
# NOTE: pickle.load can execute arbitrary code, so only load files that were
# produced by this notebook or come from a trusted source.
with open('rfr_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Sanity check: the reloaded pipeline should report the same score as the
# in-memory pipeline did before it was saved.
print('Loaded model score:', loaded_model.score(X_train,y_train))

Loaded model score: 0.9860374134161146


In [18]:
# Predict with the reloaded pipeline. The samples are wrapped in a DataFrame
# that carries the training column names, matching the input type the
# pipeline was fitted on (avoids scikit-learn feature-name warnings and makes
# the expected column order explicit).
# NOTE(review): labels not seen during training are silently ignored by the
# encoder (handle_unknown='ignore') - confirm 'React' and 'C#' occur in data.csv.
unseen_samples = pd.DataFrame(
    [['React', 15, 4, 'UserModule'], ['C#', 15, 4, 'UserModule']],
    columns=X_train.columns,
)
loaded_model.predict(unseen_samples)



array([1.64, 1.73])