In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw records from the Excel workbook that sits next to this notebook.
df = pd.read_excel(io='data.xlsx')

In [3]:
# Preview the first rows to sanity-check the loaded columns and their values.
df.head()

Unnamed: 0,technology,Exp,Modules,Category,Project Name,Estimated Time
0,C#,2,User Management,Hotel Management System,Book My Hotel,10
1,C#,2,Room Booking Service,Hotel Management System,Book My Hotel,20
2,C#,2,Payment Gateway,Hotel Management System,Book My Hotel,5
3,C#,2,Cart,Hotel Management System,Book My Hotel,30
4,C#,1,User Management,Hotel Management System,Book My Hotel,15


In [4]:
# Feature matrix: the first five columns by position
# (technology, Exp, Modules, Category, Project Name in the preview below).
X = df.iloc[:, :5]
X.head()

Unnamed: 0,technology,Exp,Modules,Category,Project Name
0,C#,2,User Management,Hotel Management System,Book My Hotel
1,C#,2,Room Booking Service,Hotel Management System,Book My Hotel
2,C#,2,Payment Gateway,Hotel Management System,Book My Hotel
3,C#,2,Cart,Hotel Management System,Book My Hotel
4,C#,1,User Management,Hotel Management System,Book My Hotel


In [5]:
# Regression target: the column at position 5 (shown as "Estimated Time" in the
# y.head() output). Positional selection depends on the sheet's column order.
y = df.iloc[:,5]

In [6]:
# Preview the first target values to confirm the right column was selected.
y.head()

0    10
1    20
2     5
3    30
4    15
Name: Estimated Time, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Column positions within X (see the X.head() preview): position 1 holds the
# numeric Exp values, positions 0, 2, 3 and 4 hold string-valued categories.
categorical_cols = [0, 2, 3, 4]

# Numeric branch: standardize the feature.
numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical branch: one-hot encode. handle_unknown='ignore' lets transform()
# accept categories that were not present at fit time instead of raising.
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Route each group of columns through its own branch and concatenate the results.
preprocessor = ColumnTransformer(transformers=[
    ('scl', numerical_transformer, [1]),
    ('cat', categorical_transformer, categorical_cols),
])

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel


# Active estimator: a random forest of 100 trees with a fixed seed.
model = RandomForestRegressor(random_state=0, n_estimators=100)

# Alternative regressors that can be swapped in (uncomment one and comment out
# the assignment above):
# model = LinearRegression()
# model = KNeighborsRegressor(n_neighbors=3)
# model = DecisionTreeRegressor()
# model = SVR(epsilon=0.2)
# kernel = DotProduct() + WhiteKernel()
# model = GaussianProcessRegressor(kernel=kernel)

In [11]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,explained_variance_score

# Chain preprocessing and the estimator so that both are fitted together and
# the same transformations are applied automatically at prediction time.
my_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

# Fit on the training split only, then predict the held-out rows.
my_pipeline.fit(X_train, y_train)
preds = my_pipeline.predict(X_test)

# Error metrics on the held-out split; RMSE is derived from MSE.
mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)

# Report the metrics (same order and labels as before).
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)
print("Explain variance score =", round(explained_variance_score(y_test, preds), 2))

MAE: 6.226999999999999
MSE: 60.44264999999999
RMSE: 7.77448712134762
Explain variance score = 0.82


In [12]:
# Actual target values of the held-out rows, for comparison with the predictions.
y_test

29    17
15    50
24    18
17    18
8     20
9     15
30    70
25    15
12    35
0     10
Name: Estimated Time, dtype: int64

In [13]:
# Pipeline predictions for X_test, in the same row order as y_test.
preds

array([31.02, 35.8 , 22.69, 19.91, 21.96, 18.54, 64.15, 18.05, 24.26,
       12.31])

In [14]:
# Predict for two hand-written samples that differ only in the Exp value (1 vs 2).
# Each row follows X's column order: technology, Exp, Modules, Category, Project Name.
my_preds = my_pipeline.predict([
    ['C#', 1, 'User', 'Insurance', 'Inventory Management'],
    ['C#', 2, 'User', 'Insurance', 'Inventory Management'],
])



In [15]:
# Show the predictions computed for the two hand-written sample rows.
my_preds

array([35.2 , 26.09])

In [16]:
import pickle

# NOTE: this score is computed on the training split (X_train, y_train), so it
# measures fit to data the model has already seen, not held-out performance.
print('Model score:', my_pipeline.score(X_train,y_train))

# Persist the fitted pipeline (preprocessing + estimator) to disk. The context
# manager guarantees the file is flushed and closed before anything reads it back.
with open('rfr_model.sav', 'wb') as model_file:
    pickle.dump(my_pipeline, model_file)

Model score: 0.9277413483209376


In [17]:
# Reload the persisted pipeline. pickle.load can execute arbitrary code, so only
# unpickle files you trust; this file is the one written by this notebook.
# The context manager closes the file handle as soon as loading finishes.
with open('rfr_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Scoring on the same training split as before confirms the round trip
# reproduced the fitted pipeline.
print('Loaded model score:', loaded_model.score(X_train,y_train))

Loaded model score: 0.9277413483209376


In [18]:
# Re-run the two hand-written samples through the reloaded pipeline.
# Each row follows X's column order: technology, Exp, Modules, Category, Project Name.
loaded_model.predict([
    ['C#', 1, 'User', 'Insurance', 'Inventory Management'],
    ['C#', 2, 'User', 'Insurance', 'Inventory Management'],
])



array([35.2 , 26.09])