```python
Data Hackerman Final Project
By Richard Adeyeye
18-April-2023
```

In [1]:
import pandas as pd
import requests
import json
import os
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import seaborn as sns
import folium


pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.options.display.max_colwidth = None
pd.set_option("display.float_format", lambda x: '%.2f' % x)

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

### Model Creation

In [2]:
# Main working directory 
main_working_folder = os.getcwd()

# Data folder directory
data_folder = "data"

# Combining the paths
main_path = os.path.join(main_working_folder, data_folder)

In [3]:
project_data_df = pd.read_csv(os.path.join(main_path, 'project_data.csv'))

In [4]:
project_data_df.head(1)

Unnamed: 0,author.properties.friends,author.properties.status_count,author.properties.verified,content.body,location.country,properties.platform,properties.sentiment,location.latitude,location.longitude
0,1689,22566.0,False,Can't believe I'm missing Love Island 😩,GB,twitter,1.0,51.57,0.46


In [5]:
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset, TabularPredictor

In [6]:
train_data, test_data = train_test_split(project_data_df, test_size=0.33, random_state=42)

In [7]:
train_data.shape, test_data.shape

((3349, 9), (1650, 9))

In [8]:
train_data.head(2)

Unnamed: 0,author.properties.friends,author.properties.status_count,author.properties.verified,content.body,location.country,properties.platform,properties.sentiment,location.latitude,location.longitude
1556,288,2075.0,False,@brightsideram we tried nice men never worked heard bad reports he likes a drink more than players U tube clip outside pub ostrich comment,GB,twitter,0.0,53.09,-1.39
1003,1845,19394.0,False,...even better if time travel were invented and I could go there back in 1876 and have a whiskey with Al #Deadwood,GB,twitter,-1.0,51.65,-3.8


In [9]:
test_data.head(1)

Unnamed: 0,author.properties.friends,author.properties.status_count,author.properties.verified,content.body,location.country,properties.platform,properties.sentiment,location.latitude,location.longitude
84,859,36929.0,False,@JR_athletics @phil_walker spot on.,GB,twitter,-1.0,50.45,-3.55


### Training

In [10]:
%%time

save_path = 'artefacts/models_multiclass'
time_limit = 60
label = "properties.sentiment"

predictor = TabularPredictor(label=label, path=save_path, problem_type='multiclass').fit(test_data, time_limit=time_limit)

Beginning AutoGluon training ... Time limit = 60s
AutoGluon will save models to "artefacts/models_multiclass\"
AutoGluon Version:  0.7.0
Python Version:     3.9.16
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Train Data Rows:    1650
Train Data Columns: 8
Label Column: properties.sentiment
Preprocessing data ...
Train Data Class Count: 3
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    2799.32 MB
	Train Data (Original)  Memory Usage: 0.55 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 1 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
		Fittin

CPU times: total: 1min 30s
Wall time: 39.5 s


In [11]:
predictor.evaluate(test_data, silent=True)

{'accuracy': 0.9472727272727273,
 'balanced_accuracy': 0.9406344700352043,
 'mcc': 0.9160283606053036}

In [12]:
predictor.leaderboard(test_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.95,0.75,0.63,0.28,17.49,0.02,0.0,0.68,2,True,14
1,LightGBMLarge,0.94,0.72,0.11,0.02,5.66,0.11,0.02,5.66,1,True,13
2,RandomForestGini,0.94,0.71,0.19,0.08,0.89,0.19,0.08,0.89,1,True,6
3,ExtraTreesGini,0.94,0.71,0.17,0.09,0.8,0.17,0.09,0.8,1,True,9
4,RandomForestEntr,0.94,0.71,0.14,0.08,0.81,0.14,0.08,0.81,1,True,7
5,XGBoost,0.94,0.7,0.2,0.02,3.06,0.2,0.02,3.06,1,True,11
6,LightGBMXT,0.94,0.72,0.08,0.02,1.91,0.08,0.02,1.91,1,True,4
7,ExtraTreesEntr,0.94,0.69,0.17,0.09,0.8,0.17,0.09,0.8,1,True,10
8,LightGBM,0.93,0.71,0.05,0.0,1.27,0.05,0.0,1.27,1,True,5
9,KNeighborsDist,0.9,0.5,0.02,0.03,0.03,0.02,0.03,0.03,1,True,2


In [13]:
predictor.get_model_best()

'WeightedEnsemble_L2'

### Evaluation

```python
Feature Importance
```

In [14]:
y_test = test_data[label]  # values to be predicted
test_data_nolab = test_data.drop(columns=[label])   

In [15]:
test_data_nolab.head(1)

Unnamed: 0,author.properties.friends,author.properties.status_count,author.properties.verified,content.body,location.country,properties.platform,location.latitude,location.longitude
84,859,36929.0,False,@JR_athletics @phil_walker spot on.,GB,twitter,50.45,-3.55


In [16]:
# `predictor.path` is another way to get the relative path needed to later load predictor.
save_model_predictor = TabularPredictor.load(save_path) 

In [17]:
# To find out the required feature columns to make predictions
save_model_predictor.features()

['author.properties.friends',
 'author.properties.status_count',
 'author.properties.verified',
 'content.body',
 'location.country',
 'location.latitude',
 'location.longitude']

### Prediction

In [18]:
y_pred = save_model_predictor.predict(test_data_nolab)

In [19]:
# returns a DataFrame that shows which probability corresponds to which class
y_pred_prob = save_model_predictor.predict_proba(test_data_nolab)

In [20]:
y_pred_prob[1:5]

Unnamed: 0,-1.00,0.00,1.00
2470,0.88,0.04,0.08
2804,0.09,0.79,0.12
4987,0.07,0.89,0.04
4924,0.42,0.08,0.5


In [21]:
predictor.get_model_best()

'WeightedEnsemble_L2'

In [22]:
y_pred[1:5]

2470   -1.00
2804    0.00
4987    0.00
4924    1.00
Name: properties.sentiment, dtype: float64

In [23]:
y_test[1:5]

2470   -1.00
2804    0.00
4987    0.00
4924    1.00
Name: properties.sentiment, dtype: float64

In [24]:
# To show scores for other metrics, you can specify the extra_metrics argument when passing in test_data
save_model_predictor.leaderboard(test_data, extra_metrics=['accuracy', 'balanced_accuracy', 'log_loss'], silent=True)

Unnamed: 0,model,score_test,accuracy,balanced_accuracy,log_loss,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.95,0.95,0.94,-0.32,0.75,0.56,0.28,17.49,0.02,0.0,0.68,2,True,14
1,LightGBMLarge,0.94,0.94,0.94,-0.18,0.72,0.08,0.02,5.66,0.08,0.02,5.66,1,True,13
2,RandomForestGini,0.94,0.94,0.93,-0.29,0.71,0.16,0.08,0.89,0.16,0.08,0.89,1,True,6
3,ExtraTreesGini,0.94,0.94,0.93,-0.29,0.71,0.13,0.09,0.8,0.13,0.09,0.8,1,True,9
4,RandomForestEntr,0.94,0.94,0.93,-0.29,0.71,0.14,0.08,0.81,0.14,0.08,0.81,1,True,7
5,XGBoost,0.94,0.94,0.93,-0.19,0.7,0.13,0.02,3.06,0.13,0.02,3.06,1,True,11
6,LightGBMXT,0.94,0.94,0.93,-0.28,0.72,0.08,0.02,1.91,0.08,0.02,1.91,1,True,4
7,ExtraTreesEntr,0.94,0.94,0.93,-0.29,0.69,0.16,0.09,0.8,0.16,0.09,0.8,1,True,10
8,LightGBM,0.93,0.93,0.92,-0.34,0.71,0.05,0.0,1.27,0.05,0.0,1.27,1,True,5
9,KNeighborsDist,0.9,0.9,0.89,-0.84,0.5,0.02,0.03,0.03,0.02,0.03,0.03,1,True,2


In [25]:
# The predictor also remembers what metric predictions should be evaluated with
perf = save_model_predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred_prob)

Evaluation: accuracy on test data: 0.9472727272727273
Evaluations on test data:
{
    "accuracy": 0.9472727272727273,
    "balanced_accuracy": 0.9406344700352043,
    "mcc": 0.9160283606053036
}


In [26]:
# Alternatively since the label columns remains in the test_data DataFrame 
perf = save_model_predictor.evaluate(test_data)

Evaluation: accuracy on test data: 0.9472727272727273
Evaluations on test data:
{
    "accuracy": 0.9472727272727273,
    "balanced_accuracy": 0.9406344700352043,
    "mcc": 0.9160283606053036
}


### Input for Prediction

In [27]:
project_data_df.head(1)

Unnamed: 0,author.properties.friends,author.properties.status_count,author.properties.verified,content.body,location.country,properties.platform,properties.sentiment,location.latitude,location.longitude
0,1689,22566.0,False,Can't believe I'm missing Love Island 😩,GB,twitter,1.0,51.57,0.46


In [28]:
project_data_df['content.body'].unique()

array(["Can't believe I'm missing Love Island 😩",
       "Last tweet about future wedding..... if I actually want a wedding I actually need to find a guy XD we all know I'm a loner. unlovable",
       'How many times does he wonna say the phrase "i deal with shit" #LoveIsland',
       ..., '#NP Shola Ama - Loving My Baby',
       'Could not have had a worse couple days. Failed MOT, lost my passport, and now just found out I sent £300 for my Napa Accom to the wrong acc.',
       "@FHPReading Hi guys, we're from Reading and would love to take your branding to a new level. Get in touch and we'll chat more! #snapchat"],
      dtype=object)

In [29]:
project_data_df['properties.sentiment'].unique()

array([ 1., -1.,  0.])

In [30]:
# save_model_predictor.predict(project_data_df)

In [32]:
save_model_predictor.predict(project_data_df)[0]

1.0

### Create Sample Data

In [33]:
sample_data_dict = {
    
    "author.properties.friends": 114,
    "author.properties.status_count": 1377,
    "author.properties.verified": "True",
    "content.body": "Can't believe I'm missing Love Island 😩",
    "location.country": 'NG',
    "properties.platform": 'facebook',
    "location.latitude": 52.96974444,
    "location.longitude": -1.172266
    
}

In [34]:
sample_data = pd.DataFrame([sample_data_dict])

In [35]:
sample_data.head()

Unnamed: 0,author.properties.friends,author.properties.status_count,author.properties.verified,content.body,location.country,properties.platform,location.latitude,location.longitude
0,114,1377,True,Can't believe I'm missing Love Island 😩,NG,facebook,52.97,-1.17


In [36]:
save_model_predictor.predict(sample_data)

0   1.00
Name: properties.sentiment, dtype: float64

In [37]:
save_model_predictor.predict(sample_data)[0]

1.0