# Example usage

To use `simplefit` in a project, import the package as follows:

# Imports

In [1]:
# Standard library
import warnings

# Third-party
import altair as alt
import pandas as pd

# simplefit: the package itself plus the helpers demonstrated in this notebook
import simplefit
from simplefit.classifier import classifier
from simplefit.cleaner import cleaner
from simplefit.eda import plot_corr, plot_distributions, plot_splom
from simplefit.regressor import regressor

# Silence all Python warnings for the rest of the session to keep cell output compact
warnings.filterwarnings('ignore')

# Render Altair charts as HTML; kept as the last expression so the cell
# echoes the active renderer
alt.renderers.enable('html')


RendererRegistry.enable('html')

In [2]:
# Print the version string of the imported simplefit package
print(simplefit.__version__)

0.1.0


In [3]:
# Optional Altair renderer settings (uncomment the one that matches your environment)
# alt.renderers.enable('notebook')   # to render plot on jupyter notebook
# alt.renderers.enable('mimetype')   # to render plot on github

# Altair will not plot a dataset with more than 5000 rows by default; to lift that limit, enable one of these:
# alt.data_transformers.enable("data_server")

# OR

# from altair import pipe, limit_rows, to_values
# t = lambda data: pipe(data, limit_rows(max_rows=1000000), to_values)
# alt.data_transformers.register('custom', t)
# alt.data_transformers.enable('custom')


# Sample Data
We will be using the SpotifyFeatures.csv data as an example.

In [15]:
# Read the Spotify features CSV into a DataFrame (the path is relative, so it
# resolves against the current working directory)
df = pd.read_csv("../tests/data/SpotifyFeatures.csv")

# Clean data 
Loads and cleans the dataset: removes NA rows, strips extra white space, etc., and returns a clean dataframe. Import the `cleaner` function from the module `simplefit.cleaner`.

In [16]:

# Clean the raw Spotify frame; lower_case=False presumably leaves text values
# in their original case (TODO confirm against the cleaner documentation)
clean_df = cleaner(df, lower_case=False)


In [17]:
# Display the cleaned frame; for a frame this large the rendered output is
# truncated to the first and last rows
clean_df

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.61100,0.389,99373,0.910,0.000000,C#,0.3460,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.24600,0.590,137373,0.737,0.000000,F#,0.1510,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.95200,0.663,170267,0.131,0.000000,C,0.1030,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.70300,0.240,152427,0.326,0.000000,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95000,0.331,82625,0.225,0.123000,F,0.2020,-21.150,Major,0.0456,140.576,4/4,0.390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232720,Soul,Slave,Son Of Slide,2XGLdVl7lGeq8ksM6Al7jT,39,0.00384,0.687,326240,0.714,0.544000,D,0.0845,-10.626,Major,0.0316,115.542,4/4,0.962
232721,Soul,Jr Thomas & The Volcanos,Burning Fire,1qWZdkBl4UVPj9lK6HuuFM,38,0.03290,0.785,282447,0.683,0.000880,E,0.2370,-6.944,Minor,0.0337,113.830,4/4,0.969
232722,Soul,Muddy Waters,(I'm Your) Hoochie Coochie Man,2ziWXUmQLrXTiYjCg2fZ2t,47,0.90100,0.517,166960,0.419,0.000000,D,0.0945,-8.282,Major,0.1480,84.135,4/4,0.813
232723,Soul,R.LUM.R,With My Words,6EFsue2YbIG4Qkq8Zr9Rir,44,0.26200,0.745,222442,0.704,0.000000,A,0.3330,-7.137,Major,0.1460,100.031,4/4,0.489


# Plot Distributions 
Creates distribution plots for either all numeric columns or only the columns passed to it. Import the `plot_distributions` function from the module `simplefit.eda`.


In [18]:
# Altair will not plot more than 5000 rows by default, so keep only the first 5000.
# NOTE: this rebinds `clean_df`; the later cells that use `clean_df` operate on
# the truncated frame.
clean_df = clean_df.iloc[:5000]

# Histograms (40 bins each) for the three listed numeric columns
plot_distributions(
    clean_df,
    bins=40,
    dist_cols=['danceability', 'duration_ms', 'energy'],
)

# Plot Correlation plot
Creates a correlation plot for all the columns in the dataframe.

In [19]:
# Spearman correlation plot.
# NOTE(review): this is given the raw `df`, not the cleaned/truncated `clean_df`
# used by the neighbouring cells — confirm that is intended.
plot_corr(df, corr='spearman')


# Plot SPLOM
Creates a scatter-plot matrix (`SPLOM`) for all the numeric columns in the dataframe, or only for the columns passed by the user.

In [20]:
# Scatter-plot matrix restricted to the two listed columns
plot_splom(
    clean_df,
    pair_cols=["energy", "acousticness"],
)


# Fit Regressor
Preprocesses the data, fits a baseline model (Dummy Regressor) alongside Ridge, RidgeCV, and linear regression models with their default setup, and returns the model scores in the form of a dataframe.

In [10]:
# Model `popularity` from two numeric features and one categorical feature;
# cv=10 is presumably the number of cross-validation folds (TODO confirm)
regressor(
    clean_df,
    target_col='popularity',
    numeric_feats=['danceability', 'loudness'],
    categorical_feats=['genre'],
    cv=10,
)


Unnamed: 0,DummyRegressor,Ridge,RidgeCV,linearRegression
fit_time,0.001225,0.004901,0.033313,0.006932
score_time,0.000328,0.00207,0.001798,0.001586
test_score,-2.980655,-0.006898,-0.000396,-0.007574
train_score,0.0,0.898819,0.897763,0.898828


# Fit Classifier
Preprocesses the data, fits a baseline model (Dummy Classifier) and a Logistic Regression model with default setup, and returns the model scores in the form of a dataframe.

In [11]:
# Load the adult dataset used for the classification example
classification_df = pd.read_csv("../tests/data/adult.csv")

# NOTE: `clean_df` is rebound here; from this point on it refers to the cleaned
# adult data, not the Spotify data used in the earlier cells.
clean_df = cleaner(classification_df, lower_case=True)


First, the classifier is called with all of its inputs specified.

In [12]:
# Classify `income` with explicitly chosen numeric and categorical features
classifier(
    clean_df,
    target_col='income',
    numeric_feats=['age', 'fnlwgt'],
    categorical_feats=['occupation'],
    cv=10,
)

Unnamed: 0,DummyClassifier,LogisticRegression
fit_time,0.012109,0.236336
score_time,0.003935,0.00636
test_score,0.75919,0.768619
train_score,0.75919,0.768912


When `numeric_feats` is left empty, the classifier trains on all numeric features.

In [13]:
# Same call as before, but with an empty `numeric_feats` list
classifier(
    clean_df,
    target_col='income',
    numeric_feats=[],
    categorical_feats=['occupation'],
    cv=10,
)

Unnamed: 0,DummyClassifier,LogisticRegression
fit_time,0.011936,0.214061
score_time,0.003959,0.006434
test_score,0.75919,0.792304
train_score,0.75919,0.821914
