### Loading & Importing Libraries

In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import seaborn as sns

### Loading the dataset 

In [16]:
# Read the dataset from the JSON file into a DataFrame
df = pd.read_json(path_or_buf="mlData.json")

# Preview the first five rows
df.head(n=5)

Unnamed: 0,timeOfDay,location,temperature,humidity,lightIntensity
0,11:15,on_campus,41.433333,44.805882,1198
1,11:16,on_campus,41.611111,44.8,1198
2,11:17,on_campus,41.642857,44.857143,1198
3,11:18,on_campus,41.72,44.666667,1198
4,11:19,on_campus,41.866667,44.394118,1198


### Data Cleaning & Preparation

In [17]:
# Discard every row that contains at least one missing value
df = df.dropna(axis=0, how="any")

Inspecting Data Characteristics

In [18]:
# Print a summary of the frame: index range, column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   timeOfDay       42 non-null     object 
 1   location        42 non-null     object 
 2   temperature     42 non-null     float64
 3   humidity        42 non-null     float64
 4   lightIntensity  42 non-null     int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 1.8+ KB


In [19]:
# Independent snapshot of the cleaned frame, taken before any column is label-encoded
df_original = df.copy(deep=True)

In [20]:
from sklearn.preprocessing import LabelEncoder

# One encoder per categorical column, so each string <-> integer mapping stays
# available afterwards (a single shared encoder only remembers the last column
# it was fitted on).
le_timeOfDay = LabelEncoder()
le_location = LabelEncoder()

# Fit on the untouched strings in df_original rather than on df itself:
# re-running this cell then produces the same codes, and the encoders keep the
# original string labels instead of being refitted on integer codes.
df['timeOfDay'] = le_timeOfDay.fit_transform(df_original['timeOfDay'])
df['location'] = le_location.fit_transform(df_original['location'])

# Backward-compatible alias: `le` previously ended up fitted on 'location'.
le = le_location

# NOTE(review): LabelEncoder numbers labels in sorted (string) order, and the
# time strings do not look zero-padded (e.g. '12:0'), so the integer codes may
# not follow clock order -- confirm this is acceptable for a linear model.

# Preview the encoded frame
df.head()

Unnamed: 0,timeOfDay,location,temperature,humidity,lightIntensity
0,0,0,41.433333,44.805882,1198
1,1,0,41.611111,44.8,1198
2,2,0,41.642857,44.857143,1198
3,3,0,41.72,44.666667,1198
4,4,0,41.866667,44.394118,1198


In [21]:
# Pairwise Pearson correlation between the columns.
# NaN entries are expected for columns whose values never vary
# (presumably location and lightIntensity in this dataset -- verify).
df.corr(method="pearson")

Unnamed: 0,timeOfDay,location,temperature,humidity,lightIntensity
timeOfDay,1.0,,0.800803,0.59909,
location,,,,,
temperature,0.800803,,1.0,0.930169,
humidity,0.59909,,0.930169,1.0,
lightIntensity,,,,,


Select Features (X) and Target (y)

In [22]:
# Target: the temperature column the model will learn to predict
y = df['temperature']

# Features: every remaining column
X = df.drop(columns=['temperature'])

# Display both objects
X, y

(    timeOfDay  location   humidity  lightIntensity
 0           0         0  44.805882            1198
 1           1         0  44.800000            1198
 2           2         0  44.857143            1198
 3           3         0  44.666667            1198
 4           4         0  44.394118            1198
 5           5         0  44.675000            1198
 6           6         0  44.633333            1198
 7           7         0  44.611765            1198
 8           8         0  44.516667            1198
 9           9         0  44.441176            1198
 10         10         0  44.376923            1198
 11         11         0  44.092308            1198
 12         12         0  43.955556            1198
 13         13         0  43.775000            1198
 14         14         0  43.770588            1198
 15         15         0  43.515385            1198
 16         16         0  49.266667            1198
 17         17         0  44.486667            1198
 18         

### Train/Test Split

In [23]:
import sklearn as sk
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffled split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    shuffle=True,
)

### Model Training

In [24]:
from sklearn.linear_model import LinearRegression

# Fit a multiple linear regression on the training split.
# fit() returns the estimator itself, so `model` and `regr` name the same object.
model = LinearRegression().fit(X_train, y_train)
regr = model

In [25]:
from sklearn.metrics import mean_squared_error

# Predict temperatures for the held-out rows and score them with the
# mean squared error (lower is better).
y_pred = regr.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error (MSE): ", mse)

Mean Squared Error (MSE):  9.171870724746269


### Model Testing

In [26]:

# Mean absolute percentage error (MAPE): the mean of |actual - predicted| / |actual|,
# expressed as a percentage. Lower is better.
# See https://en.wikipedia.org/wiki/Mean_absolute_percentage_error

import numpy as np

# Plain NumPy copies of the true and predicted values
actual = np.array(y_test)
pred = np.array(y_pred)

# Per-row relative error, then its mean scaled to percent
relative_errors = np.abs((actual - pred) / actual)
mape = relative_errors.mean() * 100
print(f"mean absolute percentage error= {mape: .2f}")

mean absolute percentage error=  4.29


In [27]:
# Further error metrics on the test split: MAE, MSE and RMSE
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # square root of the MSE

print(f'Mean absolute error: {mae:.2f}')    # average absolute deviation of prediction from true value
print(f'Mean squared error: {mse:.2f}')
print(f'Root mean squared error: {rmse:.2f}')

Mean absolute error: 2.43
Mean squared error: 9.17
Root mean squared error: 3.03


### Making Prediction Using Model

In [28]:
# Predict temperatures for the held-out rows
y_pred = model.predict(X_test)

# Predictions as a one-column frame that shares the test rows' index
y_pred_df = pd.Series(y_pred, index=X_test.index, name='Predicted').to_frame()

# Features, true value and prediction side by side
comparison_df = X_test.assign(
    Actual=y_test,
    Predicted=y_pred_df['Predicted'],
)

# Rebuild one encoder per categorical column from the untouched copy of the data ...
le_timeOfDay = LabelEncoder().fit(df_original['timeOfDay'])
le_location = LabelEncoder().fit(df_original['location'])

# ... and map the integer codes back to their readable labels
comparison_df['timeOfDay'] = le_timeOfDay.inverse_transform(comparison_df['timeOfDay'])
comparison_df['location'] = le_location.inverse_transform(comparison_df['location'])

# Show the first rows of the comparison table
print(comparison_df.head())

   timeOfDay   location   humidity  lightIntensity     Actual  Predicted
30     12:14  on_campus  61.266667            1198  89.400000  84.950128
36     12:20  on_campus  60.984211            1198  89.400000  88.748603
27      12:0  on_campus  34.500000            1198  40.800000  38.744640
4      11:19  on_campus  44.394118            1198  41.866667  38.692523
10     11:25  on_campus  44.376923            1198  41.900000  42.927775
