# Example usage

To use `simplefit` in a project:

# Imports

In [1]:
import simplefit
import altair as alt
import warnings
import pandas as pd
# Suppress warnings so they do not clutter the cell outputs below.
warnings.filterwarnings('ignore')
# Select Altair's 'data_server' data transformer for the charts in this notebook.
# presumably this relies on the altair_data_server package being installed — TODO confirm
alt.data_transformers.enable('data_server')
# NOTE(review): two renderers are enabled back to back; the later 'html' call
# looks like it replaces 'altair_saver' as the active renderer — confirm whether
# the first call is still needed.
alt.renderers.enable('altair_saver', fmts=['vega-lite', 'svg'])
alt.renderers.enable('html')
# Print the installed simplefit version alongside the outputs.
print(simplefit.__version__)

ValueError: 
To use the 'notebook' renderer, you must install the vega package
and the associated Jupyter extension.
See https://altair-viz.github.io/getting_started/installation.html
for more information.


# Sample Data
We will be using the SpotifyFeatures.csv data as an example.

In [6]:
# Load the Spotify sample data from the test fixtures and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer="../tests/data/SpotifyFeatures.csv",
)

df.head()

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4/4,0.39


# Clean data 
Cleans the loaded dataset: removes rows containing NA values, strips extra whitespace, etc., and returns a clean dataframe. The `cleaner` function is imported from the module `simplefit.cleaner`.

In [7]:
from simplefit.cleaner import cleaner
# Clean the raw frame with lower-casing switched off, then preview the result.
clean_df = cleaner(
    df,
    lower_case=False,
)
clean_df.head()

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4/4,0.39


# Plot Distributions 
Creates distribution plots for either all the numeric columns or only the ones passed in. The `plot_distributions`, `plot_corr`, and `plot_splom` functions are imported from the module `simplefit.eda`.


In [8]:
from simplefit.eda import plot_distributions, plot_corr, plot_splom

In [9]:
# Plot the distributions of three selected columns of the cleaned data (bins=40).
dist_chart = plot_distributions(
    clean_df,
    bins=40,
    dist_cols=['danceability', 'duration_ms', 'energy'],
)

In [10]:
dist_chart

# Plot Correlation plot
Creates a correlation plot for all the columns in the dataframe, using the correlation method passed via `corr` (Spearman in this example).

In [11]:
# Plot the Spearman correlations on the cleaned frame (`clean_df`) rather than
# the raw `df`, so this chart is based on the same data as the distribution
# plot and the models fitted in this notebook.
corr_chart = plot_corr(clean_df, corr='spearman')
corr_chart

# Plot SPLOM
Creates a `SPLOM` (scatter plot matrix) for either all the numeric columns in the dataframe or only the ones passed by the user.

In [12]:
# Build the scatter plot matrix for the two selected columns from the cleaned
# frame (`clean_df`) rather than the raw `df`, matching the data used by the
# distribution plot and the models fitted in this notebook.
splom_chart = plot_splom(clean_df, pair_cols=["energy", "acousticness"])
splom_chart

# Fit Regressor
Preprocesses the data, fits a baseline model (Dummy Regressor) along with Ridge, RidgeCV, and linear regression models using their default setup, and returns the model scores in the form of a dataframe.

In [13]:
from simplefit.regressor import regressor
# Score the regression models on `popularity` using two numeric features and
# one categorical feature; cv=10 is passed through to `regressor`.
regression_results = regressor(
    clean_df,
    target_col='popularity',
    numeric_feats=['danceability', 'loudness'],
    categorical_feats=['genre'],
    cv=10,
)


In [14]:
regression_results

Unnamed: 0,DummyRegressor,Ridge,RidgeCV,linearRegression
fit_time,0.02527,0.28036,1.995253,0.218474
score_time,0.001239,0.021163,0.015575,0.014225
test_score,-0.353937,0.141713,0.14567,0.14342
train_score,0.0,0.7259,0.72589,0.725904


# Fit Classifier
Preprocesses the data, fits a baseline model (Dummy Classifier) and a Logistic Regression model with the default setup, and returns the model scores in the form of a dataframe.

In [15]:
from simplefit.classifier import classifier

In [16]:
# Load the adult dataset from the test fixtures and clean it with lower-casing
# switched on.
classification_df = pd.read_csv(
    filepath_or_buffer="../tests/data/adult.csv",
)
clean_classification_df = cleaner(
    classification_df,
    lower_case=True,
)


First, the classifier is called with all of its inputs specified.

In [17]:
# Score the classifiers on `income` with explicitly listed numeric and
# categorical features; cv=10 is passed through to `classifier`.
classification_results = classifier(
    clean_classification_df,
    target_col='income',
    numeric_feats=['age', 'fnlwgt'],
    categorical_feats=['occupation'],
    cv=10,
)

In [18]:
classification_results

Unnamed: 0,DummyClassifier,LogisticRegression
fit_time,0.015987,0.269145
score_time,0.005939,0.010424
test_score,0.75919,0.768619
train_score,0.75919,0.768912


Next, we pass an empty list as the numeric features input to test the function. In that case, the function picks up all numeric-type columns on its own.

In [19]:
# Same call as before, but with an empty `numeric_feats` list.
classification_results = classifier(
    clean_classification_df,
    target_col='income',
    numeric_feats=[],
    categorical_feats=['occupation'],
    cv=10,
)

In [20]:
classification_results

Unnamed: 0,DummyClassifier,LogisticRegression
fit_time,0.033082,0.550179
score_time,0.008742,0.015839
test_score,0.75919,0.792304
train_score,0.75919,0.821914
