# Simple Arithmetic Project
**Objective:** This project is aimed at training a model that will perform simple arithmetic operations.


**Evaluation:** The model should get a r2_score of 0.85 and have a mean absolute error of less than 10.

## Importing libraries 

In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

## Importing data and performing Exploratory Data Analysis

In [2]:
data = pd.read_csv("./data/Larger_dataset.csv")
data.head()

Unnamed: 0,First_figure,Operator,Second_figure,Result
0,20,+,8,28.0
1,10,/,12,0.833333
2,20,+,10,30.0
3,6,+,17,23.0
4,17,-,14,3.0


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   First_figure   800 non-null    int64  
 1   Operator       800 non-null    object 
 2   Second_figure  800 non-null    int64  
 3   Result         800 non-null    float64
dtypes: float64(1), int64(2), object(1)
memory usage: 25.1+ KB


In [4]:
data.duplicated().sum()

128

In [5]:
data.drop_duplicates(inplace=True)
len(data)

672

In [6]:
data.Operator.value_counts()

+    175
-    174
*    163
/    160
Name: Operator, dtype: int64

In [10]:
for label, content in data.items():
    if pd.api.types.is_string_dtype(content):
        data[label] = content.astype("category").cat.as_ordered()
        data[label] = pd.Categorical(content).codes + 1

In [11]:
data.head()

Unnamed: 0,First_figure,Operator,Second_figure,Result
0,20,2,8,28.0
1,10,4,12,0.833333
2,20,2,10,30.0
3,6,2,17,23.0
4,17,3,14,3.0


In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [27]:
np.random.seed(42)
X = data.drop("Result", axis=1)
y = data.Result
X_train, X_test, y_train, y_test = train_test_split(X,
                                                   y,
                                                   test_size=0.2)

model = RandomForestRegressor().fit(X_train, y_train)


In [28]:
model.score(X_train, y_train)

0.9986969573507096

In [29]:
model.score(X_test, y_test)

0.9956514649002176

In [16]:
y_preds = model.predict(X_test)

In [18]:
mae = mean_absolute_error(y_test, y_preds)
mae

3.070022558494264

In [19]:
r2_score(y_test, y_preds)

0.9947260646275384

In [21]:
grid = {
    "max_depth": [None, 5],
    "n_estimators": [10, 100, 1000],
    "max_features": ["auto", "sqrt"],
    "min_samples_leaf": [1, 2, 4],
    "n_jobs": [None, -1]
}

gs_model = GridSearchCV(model, grid, verbose=2, cv=5)
gs_model.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] max_depth=None, max_features=auto, min_samples_leaf=1, n_estimators=10, n_jobs=None 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=1, n_estimators=10, n_jobs=None, total=   0.1s
[CV] max_depth=None, max_features=auto, min_samples_leaf=1, n_estimators=10, n_jobs=None 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=1, n_estimators=10, n_jobs=None, total=   0.0s
[CV] max_depth=None, max_features=auto, min_samples_leaf=1, n_estimators=10, n_jobs=None 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=1, n_estimators=10, n_jobs=None, total=   0.0s
[CV] max_depth=None, max_features=auto, min_samples_leaf=1, n_estimators=10, n_jobs=None 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=1, n_estimators=10, n_jobs=None, total=   0.0s

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s



[CV] max_depth=None, max_features=auto, min_samples_leaf=1, n_estimators=10, n_jobs=None 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=1, n_estimators=10, n_jobs=None, total=   0.0s
[CV] max_depth=None, max_features=auto, min_samples_leaf=1, n_estimators=10, n_jobs=-1 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=1, n_estimators=10, n_jobs=-1, total=   2.2s
[CV] max_depth=None, max_features=auto, min_samples_leaf=1, n_estimators=10, n_jobs=-1 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=1, n_estimators=10, n_jobs=-1, total=   0.2s
[CV] max_depth=None, max_features=auto, min_samples_leaf=1, n_estimators=10, n_jobs=-1 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=1, n_estimators=10, n_jobs=-1, total=   0.2s
[CV] max_depth=None, max_features=auto, min_samples_leaf=1, n_estimators=10, n_jobs=-1 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=1, n_estimators=10, n_jobs=-1, total=   0.2s
[CV] max_depth=None, max_features=auto, 

[CV]  max_depth=None, max_features=auto, min_samples_leaf=2, n_estimators=100, n_jobs=-1, total=   0.5s
[CV] max_depth=None, max_features=auto, min_samples_leaf=2, n_estimators=100, n_jobs=-1 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=2, n_estimators=100, n_jobs=-1, total=   0.5s
[CV] max_depth=None, max_features=auto, min_samples_leaf=2, n_estimators=100, n_jobs=-1 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=2, n_estimators=100, n_jobs=-1, total=   0.5s
[CV] max_depth=None, max_features=auto, min_samples_leaf=2, n_estimators=100, n_jobs=-1 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=2, n_estimators=100, n_jobs=-1, total=   0.5s
[CV] max_depth=None, max_features=auto, min_samples_leaf=2, n_estimators=1000, n_jobs=None 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=2, n_estimators=1000, n_jobs=None, total=   3.3s
[CV] max_depth=None, max_features=auto, min_samples_leaf=2, n_estimators=1000, n_jobs=None 
[CV]  max_depth=None, max_

[CV]  max_depth=None, max_features=auto, min_samples_leaf=4, n_estimators=1000, n_jobs=-1, total=   2.5s
[CV] max_depth=None, max_features=auto, min_samples_leaf=4, n_estimators=1000, n_jobs=-1 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=4, n_estimators=1000, n_jobs=-1, total=   2.6s
[CV] max_depth=None, max_features=sqrt, min_samples_leaf=1, n_estimators=10, n_jobs=None 
[CV]  max_depth=None, max_features=sqrt, min_samples_leaf=1, n_estimators=10, n_jobs=None, total=   0.1s
[CV] max_depth=None, max_features=sqrt, min_samples_leaf=1, n_estimators=10, n_jobs=None 
[CV]  max_depth=None, max_features=sqrt, min_samples_leaf=1, n_estimators=10, n_jobs=None, total=   0.1s
[CV] max_depth=None, max_features=sqrt, min_samples_leaf=1, n_estimators=10, n_jobs=None 
[CV]  max_depth=None, max_features=sqrt, min_samples_leaf=1, n_estimators=10, n_jobs=None, total=   0.0s
[CV] max_depth=None, max_features=sqrt, min_samples_leaf=1, n_estimators=10, n_jobs=None 
[CV]  max_depth=None, max

[CV]  max_depth=None, max_features=sqrt, min_samples_leaf=2, n_estimators=100, n_jobs=None, total=   0.3s
[CV] max_depth=None, max_features=sqrt, min_samples_leaf=2, n_estimators=100, n_jobs=None 
[CV]  max_depth=None, max_features=sqrt, min_samples_leaf=2, n_estimators=100, n_jobs=None, total=   0.3s
[CV] max_depth=None, max_features=sqrt, min_samples_leaf=2, n_estimators=100, n_jobs=None 
[CV]  max_depth=None, max_features=sqrt, min_samples_leaf=2, n_estimators=100, n_jobs=None, total=   0.3s
[CV] max_depth=None, max_features=sqrt, min_samples_leaf=2, n_estimators=100, n_jobs=None 
[CV]  max_depth=None, max_features=sqrt, min_samples_leaf=2, n_estimators=100, n_jobs=None, total=   0.3s
[CV] max_depth=None, max_features=sqrt, min_samples_leaf=2, n_estimators=100, n_jobs=-1 
[CV]  max_depth=None, max_features=sqrt, min_samples_leaf=2, n_estimators=100, n_jobs=-1, total=   0.5s
[CV] max_depth=None, max_features=sqrt, min_samples_leaf=2, n_estimators=100, n_jobs=-1 
[CV]  max_depth=None,

[CV]  max_depth=None, max_features=sqrt, min_samples_leaf=4, n_estimators=1000, n_jobs=None, total=   2.8s
[CV] max_depth=None, max_features=sqrt, min_samples_leaf=4, n_estimators=1000, n_jobs=None 
[CV]  max_depth=None, max_features=sqrt, min_samples_leaf=4, n_estimators=1000, n_jobs=None, total=   2.9s
[CV] max_depth=None, max_features=sqrt, min_samples_leaf=4, n_estimators=1000, n_jobs=-1 
[CV]  max_depth=None, max_features=sqrt, min_samples_leaf=4, n_estimators=1000, n_jobs=-1, total=   2.5s
[CV] max_depth=None, max_features=sqrt, min_samples_leaf=4, n_estimators=1000, n_jobs=-1 
[CV]  max_depth=None, max_features=sqrt, min_samples_leaf=4, n_estimators=1000, n_jobs=-1, total=   2.6s
[CV] max_depth=None, max_features=sqrt, min_samples_leaf=4, n_estimators=1000, n_jobs=-1 
[CV]  max_depth=None, max_features=sqrt, min_samples_leaf=4, n_estimators=1000, n_jobs=-1, total=   2.5s
[CV] max_depth=None, max_features=sqrt, min_samples_leaf=4, n_estimators=1000, n_jobs=-1 
[CV]  max_depth=Non

[CV]  max_depth=5, max_features=auto, min_samples_leaf=2, n_estimators=10, n_jobs=-1, total=   0.2s
[CV] max_depth=5, max_features=auto, min_samples_leaf=2, n_estimators=10, n_jobs=-1 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=2, n_estimators=10, n_jobs=-1, total=   0.2s
[CV] max_depth=5, max_features=auto, min_samples_leaf=2, n_estimators=10, n_jobs=-1 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=2, n_estimators=10, n_jobs=-1, total=   0.2s
[CV] max_depth=5, max_features=auto, min_samples_leaf=2, n_estimators=10, n_jobs=-1 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=2, n_estimators=10, n_jobs=-1, total=   0.2s
[CV] max_depth=5, max_features=auto, min_samples_leaf=2, n_estimators=100, n_jobs=None 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=2, n_estimators=100, n_jobs=None, total=   0.3s
[CV] max_depth=5, max_features=auto, min_samples_leaf=2, n_estimators=100, n_jobs=None 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=2, n_estima

[CV]  max_depth=5, max_features=auto, min_samples_leaf=4, n_estimators=1000, n_jobs=None, total=   2.5s
[CV] max_depth=5, max_features=auto, min_samples_leaf=4, n_estimators=1000, n_jobs=None 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=4, n_estimators=1000, n_jobs=None, total=   2.8s
[CV] max_depth=5, max_features=auto, min_samples_leaf=4, n_estimators=1000, n_jobs=None 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=4, n_estimators=1000, n_jobs=None, total=   2.9s
[CV] max_depth=5, max_features=auto, min_samples_leaf=4, n_estimators=1000, n_jobs=None 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=4, n_estimators=1000, n_jobs=None, total=   2.8s
[CV] max_depth=5, max_features=auto, min_samples_leaf=4, n_estimators=1000, n_jobs=None 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=4, n_estimators=1000, n_jobs=None, total=   2.8s
[CV] max_depth=5, max_features=auto, min_samples_leaf=4, n_estimators=1000, n_jobs=-1 
[CV]  max_depth=5, max_features=auto,

[CV]  max_depth=5, max_features=sqrt, min_samples_leaf=2, n_estimators=10, n_jobs=-1, total=   0.1s
[CV] max_depth=5, max_features=sqrt, min_samples_leaf=2, n_estimators=10, n_jobs=-1 
[CV]  max_depth=5, max_features=sqrt, min_samples_leaf=2, n_estimators=10, n_jobs=-1, total=   0.1s
[CV] max_depth=5, max_features=sqrt, min_samples_leaf=2, n_estimators=10, n_jobs=-1 
[CV]  max_depth=5, max_features=sqrt, min_samples_leaf=2, n_estimators=10, n_jobs=-1, total=   0.1s
[CV] max_depth=5, max_features=sqrt, min_samples_leaf=2, n_estimators=10, n_jobs=-1 
[CV]  max_depth=5, max_features=sqrt, min_samples_leaf=2, n_estimators=10, n_jobs=-1, total=   0.1s
[CV] max_depth=5, max_features=sqrt, min_samples_leaf=2, n_estimators=100, n_jobs=None 
[CV]  max_depth=5, max_features=sqrt, min_samples_leaf=2, n_estimators=100, n_jobs=None, total=   0.1s
[CV] max_depth=5, max_features=sqrt, min_samples_leaf=2, n_estimators=100, n_jobs=None 
[CV]  max_depth=5, max_features=sqrt, min_samples_leaf=2, n_estima

[CV]  max_depth=5, max_features=sqrt, min_samples_leaf=4, n_estimators=1000, n_jobs=None, total=   2.7s
[CV] max_depth=5, max_features=sqrt, min_samples_leaf=4, n_estimators=1000, n_jobs=None 
[CV]  max_depth=5, max_features=sqrt, min_samples_leaf=4, n_estimators=1000, n_jobs=None, total=   2.8s
[CV] max_depth=5, max_features=sqrt, min_samples_leaf=4, n_estimators=1000, n_jobs=None 
[CV]  max_depth=5, max_features=sqrt, min_samples_leaf=4, n_estimators=1000, n_jobs=None, total=   3.0s
[CV] max_depth=5, max_features=sqrt, min_samples_leaf=4, n_estimators=1000, n_jobs=None 
[CV]  max_depth=5, max_features=sqrt, min_samples_leaf=4, n_estimators=1000, n_jobs=None, total=   3.1s
[CV] max_depth=5, max_features=sqrt, min_samples_leaf=4, n_estimators=1000, n_jobs=None 
[CV]  max_depth=5, max_features=sqrt, min_samples_leaf=4, n_estimators=1000, n_jobs=None, total=   2.9s
[CV] max_depth=5, max_features=sqrt, min_samples_leaf=4, n_estimators=1000, n_jobs=-1 
[CV]  max_depth=5, max_features=sqrt,

[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:  6.7min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [22]:
gs_model.score(X_train, y_train)

0.999189674680681

In [23]:
gs_model.score(X_test, y_test)

0.9951505695295906

In [24]:
y_preds_gs = gs_model.predict(X_test)

In [25]:
mean_absolute_error(y_test, y_preds_gs)

2.9456616290002593

In [26]:
gs_model.best_params_

{'max_depth': None,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'n_estimators': 100,
 'n_jobs': None}

## Saving the model

In [31]:
import pickle
pickle.dump(gs_model, open("simple_arithmetic_model.pkl", "wb"))