Import Libraries

In [11]:
import pandas as pd
import numpy as np
from scipy import stats 

import plotly.express as px
import pandas_profiling as pp
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import style
import seaborn as sns

In [12]:
!pip install sweetviz

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sweetviz
  Downloading sweetviz-2.1.4-py3-none-any.whl (15.1 MB)
[K     |████████████████████████████████| 15.1 MB 4.6 MB/s 
Installing collected packages: sweetviz
Successfully installed sweetviz-2.1.4


In [13]:
import sweetviz as sv

In [14]:
#importing other necessary libs
from datetime import datetime
import pytz
import time
import os

In [15]:
from autosklearn import regression,classification
import sklearn
from sklearn.model_selection import train_test_split
import autosklearn
from autosklearn.metrics import r2 as auto_r2, mean_squared_error as auto_mse
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score,make_scorer

### Data Reading

In [17]:
df=pd.read_csv("/content/adult.csv",na_values=' ?')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [20]:
(df.isna().sum()/len(df['age']))*100

age               0.000000
workclass         5.638647
fnlwgt            0.000000
education         0.000000
education-num     0.000000
marital-status    0.000000
occupation        5.660146
relationship      0.000000
race              0.000000
sex               0.000000
capital-gain      0.000000
capital-loss      0.000000
hours-per-week    0.000000
country           1.790486
salary            0.000000
dtype: float64

EDA Analysis (Plots in PDF)

In [21]:
### Plot handling functions

def convert_dtype(input_df,input_dict):
    """
    Use : Change data type as per requirement
    Return : DataFrame with corrected dType
    """
    output_df = input_df.astype(input_dict,)
    return output_df


def html_handler(input_str,filename,folder_name):
    """
    Use : Create / Update HTML file
    Return : None
    """ 
    
    try:
        with open(f"{folder_name}/{filename}.html",'a') as file:
            file.write(f"<h2 align ='center' style='color:red'> {input_str} </h2>")
            file.write("""<form> <input type="button" value="Go back!" onclick="history.back()"> </form>""")
    except Exception as e:
        pass
    return None


def plotly_to_html(fig,filename,folder_name):
    """
    Use : Store Plotly plot HTML file
    Return : None
    """
    
    try:
        with open(f"{folder_name}/{filename}.html", 'a') as f:
            f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))
    except Exception as e:
        pass
        
    return None

def html_create_index(input_str,filename,folder_name):
    """
    Use : Create index file for all the plotly plot
    Return : None
    """
    parent_name=folder_name.split("/")[0]    
    #logging.info(f"html_create_index called to create {folder_name}/{filename}.html and {parent_name}/EDA.html ")
    

    try:
        if os.path.exists(f"{parent_name}/EDA.html"):
            with open(f"{parent_name}/EDA.html",'a') as file:
                file.write(f"<li><a href={filename}/{filename}.html>{input_str}</a></li>")
        else:
            with open(f"{parent_name}/EDA.html",'a') as file:
                file.write(f"<h1 align ='center' style='color:red'> Complete EDA </h1>")
                file.write(f"<li><a href={filename}/{filename}.html>{input_str}</a></li>")


        with open(f"{folder_name}/{filename}.html",'a') as file:
            file.write(f"<h1 align ='center' style='color:red'> {input_str} </h1>")
            file.write("""<form> <input type="button" value="Go back!" onclick="history.back()"> </form>""")

        for f in os.listdir(f"{folder_name}"):
            if f.endswith('.html') and f!=f"{filename}.html":
                with open(f"{folder_name}/{filename}.html",'a') as file:
                    f_href=f.replace(" ","%20")
                    file.write(f"<li><a href={f_href}>{f}</a></li>")
            else:
                pass
    except Exception as e:
        pass

    return None

In [22]:
##*****************************************************************************************  
## ************************* Function for Categorical Target Variavble ************************


## ******Categorical Vs. Categorical Target variable

def cat_cat_var_plot(df,target_col,filename="Graph4",path="Graph4"):
    """
    Use : Create HTML Plot file for Categorical Vs. Categorical Target Variable
    and combine it with overall EDA graph HTML
    Return : None
    """
    
    path=f"DynamicPlot/{path}"
    if not os.path.exists(path):
        os.makedirs(path) 
    
    try:
        categorical_var_list=df.select_dtypes(include=["object"])
        for column in categorical_var_list:
            html_handler(input_str=column,filename=column,folder_name=path)
            inp_df=df.groupby(column)[target_col].count().reset_index(name=f'{column} : Count')
            
            try:
                
                fig1 = px.bar(inp_df, x=column, y=f'{column} : Count',color=column,
                        barmode="group",title=f'{column} : Count Plot')
                fig1=fig1.update_layout(title_x=0.5,
                                        title_font_family="Times New Roman",
                                        title_font_color="black",title_font_size=20)       
                plotly_to_html(fig=fig1,filename=column,folder_name=path)
            except Exception as e:
                pass

            try:
                
                inp_df=df.groupby([target_col,column])[[target_col]].count().rename(columns={target_col:f"{target_col}_count"})
                inp_df=inp_df.reset_index()
                fig2 = px.bar(inp_df, x=target_col, 
                                y=f'{target_col}_count',color=column,
                                barmode="group",title=f'{column} :Stacked Count Plot')
                fig2=fig2.update_layout(title_x=0.5,title_font_family="Times New Roman",
                                        title_font_color="black",title_font_size=20)
                plotly_to_html(fig=fig2,filename=column,folder_name=path)
            except Exception as e:
                pass

            try:
                
                inp_df=df.groupby(column)[target_col].count().reset_index(name=f'{column} : Count')
                fig3=px.pie(inp_df, values=f'{column} : Count', names=column, title=f'{column} : Pie Chart')
                fig3=fig3.update_traces(textposition='inside', textinfo='percent+label')
                fig3=fig3.update_layout(title_x=0.5,title_font_family="Times New Roman",title_font_color="black",
                            title_font_size=20)
                plotly_to_html(fig=fig3,filename=column,folder_name=path)
            except Exception as e:
                pass         

    except Exception as e:
                pass

    html_create_index(input_str="Categorical Variable Analysis",filename="Graph4",folder_name=path)
    
    return None

## ******Numerical Vs. Categorical Target variable

def num_cat_var_plot(df,target_col,filename="Graph5",path="Graph5"):
    """
    Use : Create HTML Plot file for Numerical Vs. Categorical Target Variable
    and combine it with overall EDA graph HTML
    Return : None
    """
    

    path=f"DynamicPlot/{path}"
    
    if not os.path.exists(path):
        os.makedirs(path) 
    

    try:
        numerical_var_list=df.select_dtypes(exclude=["object"])
        for column in numerical_var_list:
            
            html_handler(input_str=column,filename=column,folder_name=path)
            try:
                
                fig1=px.histogram(df, x=column,marginal="violin",
                                    hover_data=df.columns,title=f'{column} : Histogram Plot')
                fig1=fig1.update_layout(title_x=0.5,title_font_family="Times New Roman",
                                        title_font_color="black",title_font_size=20)        
                plotly_to_html(fig=fig1,filename=column,folder_name=path)
            except Exception as e:
                pass

            try:
                
                fig2=px.violin(df, y=column, points='all', box=True,title=f'{column} : Violin Plot')
                fig2=fig2.update_layout(title_x=0.5,title_font_family="Times New Roman",
                                title_font_color="black",title_font_size=20)
                plotly_to_html(fig=fig2,filename=column,folder_name=path)
            except Exception as e:
                pass

            try:
                
                fig3=px.violin(df, x=target_col,y=column,color=target_col, points='all', box=True,title=f'{column} : Violin Plot')
                fig3=fig3.update_layout(title_x=0.5,title_font_family="Times New Roman",
                                title_font_color="black",title_font_size=20)        
                plotly_to_html(fig=fig3,filename=column,folder_name=path)
            except Exception as e:
                pass

            try:
                
                inp_df=df.groupby(target_col)[column].agg(["mean","median"]).reset_index().rename(columns={'mean': f'Mean : {column}','median':f'Median : {column}'})
                fig4 = px.bar(inp_df, x=target_col, y=[f'Mean : {column}',f'Median : {column}'],
                        barmode="group",title=f'{column} : Mean Plot')
                fig4=fig4.update_layout(title_x=0.5,title_font_family="Times New Roman",
                                        title_font_color="black",title_font_size=20)
                plotly_to_html(fig=fig4,filename=column,folder_name=path) 
            except Exception as e:
                pass

    except Exception as e:
                pass
    
    html_create_index(input_str="Numerical Variable Analysis",filename="Graph5",folder_name=path)
    

    return None
    

## ******Null Vs. Categorical Target variable
def null_cat_var_plot(df,target_col,filename="Graph6",path="Graph6"):
    """
    Use : Create HTML Plot file for Null Vs. Categorical Target Variable
    and combine it with overall EDA graph HTML
    Return : None
    """
    

    path=f"DynamicPlot/{path}"

    if not os.path.exists(path):
        os.makedirs(path)

    try:
        null_var_list=df.columns[df.isnull().any()].tolist()
        null_var_list.append(target_col)
        null_df=df[null_var_list]

        for column in null_df.iloc[:,:-1]:
            
            html_handler(input_str=column,filename=column,folder_name=path)

            null_df[column]=np.where(null_df[column].isnull(),"Null","Not Null")
            inp_df=null_df.groupby(column)[target_col].count().reset_index()

            try:
                
                fig1 = px.bar(inp_df, x=column, y=target_col,color=column,
                                barmode="group",title=f'{column} : Null Value Count Plot')
                fig1=fig1.update_layout(title_x=0.5,title_font_family="Times New Roman",
                                title_font_color="black", title_font_size=15)
                plotly_to_html(fig=fig1,filename=column,folder_name=path)
            except Exception as e:
                pass

    except Exception as e:
                pass
    
    html_create_index(input_str="Null Value Analysis",filename="Graph6",folder_name=path)
    



##*****************************************************************************************  
## ************************* Multivariate Analysis ************************

def mul_var_plot(df,target_col,filename="Graph7",path="Graph7"):
    """
    Use : Create HTML Plot file for Multivariate Analysis
    and combine it with overall EDA graph HTML
    Return : None
    """
    
    path=f"DynamicPlot/{path}"
    if not os.path.exists(path):
        os.makedirs(path) 

    html_handler(input_str=f"Multivariate Analysis : Target Variable {target_col}",filename=filename,folder_name=path)

    

    try:
        #numerical_var_list=df.select_dtypes(exclude=["object"])
        corr_df = df.corr()
        corrSale_df=pd.DataFrame(corr_df[target_col])
        corrSale_df.reset_index(inplace=True)
        corrSale_df.sort_values(by=target_col,ascending=True,inplace=True)
        corrSale_df.dropna(inplace=True)
        corrSale_df.columns=["Feature","Pearson_Corr"]
        
        fig1 = px.bar(corrSale_df, y="Feature", x="Pearson_Corr",
                        color="Pearson_Corr",
                        barmode="group",
                        title=f'Correlation Plot-1 : Target Variable {target_col}')
        fig1=fig1.update_layout(title_x=0.5,
                                title_font_family="Times New Roman",
                                title_font_color="black",
                                title_font_size=20)
        plotly_to_html(fig=fig1,filename=filename,folder_name=path)
    except Exception as e:
                pass
    try:
        
        fig2=px.imshow(corr_df, text_auto=True, 
                        aspect="auto",
                        title=f'Correlation Plot-2 : Target Variable {target_col}')
        fig2=fig2.update_layout(title_x=0.5,
                            title_font_family="Times New Roman",
                            title_font_color="black",
                            title_font_size=20)
        plotly_to_html(fig=fig2,filename=filename,folder_name=path)
    except Exception as e:
                pass

    html_create_index(input_str="Multivariate Analysis : Target Variable",filename="Graph7",folder_name=path)


##*****************************************************************************************  
## ************************* SWEETVIZ and PANDAS PROFILING ********************************

def SweetViz_report(df,filename="Graph8",path="Graph8"):
    """
    Use : Create HTML plot using SweetViz
    and combine it with overall EDA graph HTML
    Return : None
    """
    

    path=f"DynamicPlot/{path}"
    if not os.path.exists(path):
        os.makedirs(path) 
    
    try:
        
        #html_handler(input_str="SweetViz Report",filename="SweetViz",folder_name=path)
        my_report=sv.analyze(df,feat_cfg=None)
        my_report.show_html(f"{path}/{filename}.html",open_browser=False)
        html_create_index(input_str="SweetViz Report",filename=filename,folder_name=path)
        #feature_config = sv.FeatureConfig(skip="", force_text=[""])
        #Compare Two DataFrame i.e. e.g. Test vs Training sets
        #my_report = sv.compare()
        # Comparing two subsets of the same dataframe (i.e. Male vs Female)
        #  my_report = sv.compare_intra()
    except Exception as e:
                pass
    return None


def PandaProfile_report(df,target_col,filename="Graph9",path="Graph9",minimal_ip=False):
    """
    Use : Create HTML plot using PandaProfiling
    and combine it with overall EDA graph HTML
    Return : None
    For Big data turn --> minimal_ip=True
    """
    
    path=f"DynamicPlot/{path}"
    if not os.path.exists(path):
        os.makedirs(path) 
    
    try:
        
        #html_handler(input_str="PandaProfiling Report",filename="PandaProfiling",folder_name=path)
        profile = pp.ProfileReport(df,title="PandaProfiling Report",explorative=True,minimal=minimal_ip)
        # minimal=True -->> For Big Dataset
        profile.to_file(f"{path}/{filename}.html")
        html_create_index(input_str="PandaProfiling Report",filename=filename,folder_name=path)
    except Exception as e:
        pass
    return None

In [24]:
cat_cat_var_plot(df,target_col="salary",filename="Graph4",path="Graph4")
num_cat_var_plot(df,target_col="salary",filename="Graph5",path="Graph5")
null_cat_var_plot(df,target_col="salary",filename="Graph6",path="Graph6")
SweetViz_report(df,filename="Graph8",path="Graph8")
PandaProfile_report(df,target_col="salary",filename="Graph9",path="Graph9",minimal_ip=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



                                             |          | [  0%]   00:00 -> (? left)

Report DynamicPlot/Graph8/Graph8.html was generated.


In [31]:
!zip -r /content/Static.zip /content/StaticPlot
#!rm -rf DynamicPlot
#!rm -rf StaticPlot

  adding: content/StaticPlot/ (stored 0%)
  adding: content/StaticPlot/null_cat_var_plot25-06-22_1638.pdf (deflated 53%)
  adding: content/StaticPlot/num_cat_var_plot25-06-22_1638.pdf (deflated 8%)
  adding: content/StaticPlot/cat_cat_var_plot25-06-22_1638.pdf (deflated 13%)


Static Graph function

In [27]:
def plot_size(graphsize):
    """
    Use : Plot size selection
    Return : Plot/Fig. Size
    """
    ## (Width*Height)
    if graphsize=="A4":
        fig_size=(11,8.5)
    elif graphsize=="A3" :
        fig_size=(16.5,12)
    elif graphsize=="mid":
        fig_size=(24,14)
    elif graphsize=="big":
        fig_size=(30,17.5)
    else:
        fig_size=(30,17.5) # By-Default big size
    
    return fig_size



## ******Categorical Vs. Categorical Target variable
def cat_cat_var_plot(df,target_col,graphsize="big",plot_style="ggplot"):
    """
    Use : Create PDF Plot file for
    Categorical Vs. Categorical Target Variable
    Return : None

    Graphsize=A4,A3,mid,big default=big
    #print(plt.style.available)
    """ 
    style.use(plot_style)
    
    path="StaticPlot"

    if not os.path.exists(path):
        os.makedirs(path)

    pdf_file_name= f"{path}/cat_cat_var_plot" + datetime.now(pytz.timezone('Asia/Kolkata')).strftime("%d-%m-%y_%H%M") + ".pdf"   
    graph_pdf = PdfPages(pdf_file_name)
    
 
    
    try:
        categorical_var_list=df.select_dtypes(include=["object"])

        for column in categorical_var_list:    
            work_figure = plt.figure(constrained_layout=False, figsize=plot_size(graphsize))
            grid = gridspec.GridSpec(ncols=2, nrows=2, figure=work_figure)

            try:
                
                ax1 = work_figure.add_subplot(grid[0,0]) ## 1st Row, 1 Column
                ax1.set_title(column.upper() + ' : Count Plot')
                sns.countplot(x=column,data = df,
                            order = df[column].value_counts().index,ax = ax1)
                plt.xticks(rotation=90)
                for p in ax1.patches:
                    ax1.annotate('{:.0f}'.format(p.get_height()),(p.get_x()+0.2, p.get_height()+2))
            except Exception as e:
                    pass

            try:
                
                ax2 = work_figure.add_subplot(grid[1, 0]) ## 2 nd Row, 1 Column
                ax2.set_title(column.upper() + ' : Stack Column Plot ')
                sns.countplot(x=column,data = df,
                            order = df[column].value_counts().index,
                            hue=target_col,ax = ax2)
                for p in ax2.patches:
                    ax2.annotate('{:.0f}'.format(p.get_height()),(p.get_x()+0.2, p.get_height()+2))
                #df_plot = df.groupby([target_col,column]).size().reset_index().pivot(columns=target_col, index=column, values=0)
                #df_plot.plot(kind='bar', stacked=True,ax = ax2)
                plt.xticks(rotation=90)
            except Exception as e:
                pass


            try: 
                    
                ax3 = work_figure.add_subplot(grid[:, 1]) ## Entire 2nd Column >>  [:, 1] This use complete Second Column
                df[column].value_counts().plot.pie(autopct = "%2.2f%%", ax=ax3)
                ax3.set_title(column.upper() + ' : Pie Plot')
            except Exception as e:
                pass

            work_figure.tight_layout()
            #plt.show()
            
            graph_pdf.savefig(work_figure)

    except Exception as e:
        pass

    graph_pdf.close()

    return None


## ******Numerical Vs. Categorical Target variable
def num_cat_var_plot(df,target_col,graphsize="big",plot_style="ggplot"):
    """
    Use : Create PDF Plot file for
    Numerical Vs. Categorical Target Variable
    Return : None

    Graphsize=A4,A3,mid,big default=big
    #print(plt.style.available)
    """
    
    
    style.use(plot_style)
    
    path="StaticPlot"

    if not os.path.exists(path):
        os.makedirs(path)

    pdf_file_name= f"{path}/num_cat_var_plot" + datetime.now(pytz.timezone('Asia/Kolkata')).strftime("%d-%m-%y_%H%M") + ".pdf"   
    graph_pdf = PdfPages(pdf_file_name)
    
 
    try:
        numerical_var_list=df.select_dtypes(exclude=["object"])
        for column in numerical_var_list:
            work_figure = plt.figure(constrained_layout=False, figsize=plot_size(graphsize))## Width , Height
            grid = gridspec.GridSpec(ncols=2, nrows=2, figure=work_figure)

            try:
                
                ax1 = work_figure.add_subplot(grid[0, :1])
                ax1.set_title(column.upper()+': Density Plot')
                sns.distplot(df[column],ax = ax1)
            except Exception as e:
                pass


            try:
                
                ax2 = work_figure.add_subplot(grid[1, :1])
                plt.hist(data = df,x=column)
                plt.title(column.upper()+' : Histogram',ax = ax2)
            except Exception as e:
                pass

            try:
                
                ax3 = work_figure.add_subplot(grid[0, 1])
                sns.boxplot(df[column], ax=ax3)
                plt.title(column.upper() + " : Box Plot")
            except Exception as e:
                pass

            try:
                
                ax4 = work_figure.add_subplot(grid[1, 1])
                ax4.set_title(column.upper() + ' : Box Plot ')
                sns.boxplot(x=target_col, y=column, data = df,
                            order = df[target_col].value_counts().index,ax=ax4)
                plt.xticks(rotation=90)
            except Exception as e:
                pass

            work_figure.tight_layout() 
            #plt.show()
            
            graph_pdf.savefig(work_figure)

    except Exception as e:
        pass

    graph_pdf.close()

    return None


## ******Null Vs. Categorical Target variable
def null_cat_var_plot(df,target_col,graphsize="big",plot_style="ggplot"):
    """
    Use : Create PDF Plot file for
    Null Vs. Categorical Target Variable
    Return : None

    Graphsize=A4,A3,mid,big default=big
    #print(plt.style.available)
    """
    
    
    style.use(plot_style)
    
    path="StaticPlot"

    if not os.path.exists(path):
        os.makedirs(path)

    pdf_file_name= f"{path}/null_cat_var_plot" + datetime.now(pytz.timezone('Asia/Kolkata')).strftime("%d-%m-%y_%H%M") + ".pdf"   
    graph_pdf = PdfPages(pdf_file_name)
    

    try:
        for column in df:
            df_temp=df.copy()
            df_temp[column]=np.where(df[column].isnull(),"Null","Not Null")
            grouped_data=df_temp.groupby(column)[target_col]
            
        if (df[column].isnull().sum()>=1):     
            work_figure = plt.figure(constrained_layout=False, figsize=plot_size(graphsize))
            grid = gridspec.GridSpec(ncols=2, nrows=2, figure=work_figure)

            try:
                
                ax1 = work_figure.add_subplot(grid[0,0]) ## 1st Row, 1 Column
                ax1.set_title(column.upper() + ' : Null Value COUNT Analysis')
                grouped_data.count().plot.bar(ax = ax1)
                for p in ax1.patches:
                    ax1.annotate('{:.0f}'.format(p.get_height()),(p.get_x()+0.2, p.get_height()+2))
            except Exception as e:
                pass

            try:
                
                ax2 = work_figure.add_subplot(grid[0,1])
                ax2.set_title(column.upper() + ' : Null Value Count Analysis')
                
                Null_val_per=f"Null Value % : {np.round(df[column].isnull().mean()*100,2)}" 

                ax2.text(0.5,0.5,Null_val_per,
                        horizontalalignment='left',
                        verticalalignment='center',
                        transform = ax2.transAxes,color='b', weight='bold',fontsize = 15)  
            except Exception as e:
                pass

            try:
                
                ax3 = work_figure.add_subplot(grid[1,0])
                ax3.set_title('Not Applicable')
            except Exception as e:
                pass
            
            try:
               
                ax4 = work_figure.add_subplot(grid[1,1])
                ax4.set_title('Not Applicable')
            except Exception as e:
                pass
            
            work_figure.tight_layout(pad=10, w_pad=10, h_pad=10.0)
            #plt.show()
            
            graph_pdf.savefig(work_figure) #papertype=graphsize

    except Exception as e:
        pass

    graph_pdf.close()

    return None

In [29]:
cat_cat_var_plot(df,target_col="salary",graphsize="A3",plot_style="ggplot")
num_cat_var_plot(df,target_col="salary",graphsize="A3",plot_style="ggplot")
null_cat_var_plot(df,target_col="salary",graphsize="A3",plot_style="ggplot")


`distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).


Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.


`distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).


Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.


`distplot` is a deprecated function and will be removed in a future version. Please adapt

In [30]:
!zip -r /content/StaticPlot.zip /content/DynamicPlot

  adding: content/DynamicPlot/ (stored 0%)
  adding: content/DynamicPlot/EDA.html (deflated 49%)
  adding: content/DynamicPlot/Graph9/ (stored 0%)
  adding: content/DynamicPlot/Graph5/ (stored 0%)
  adding: content/DynamicPlot/Graph5/Graph5.html (deflated 53%)
  adding: content/DynamicPlot/Graph5/capital-gain.html (deflated 90%)
  adding: content/DynamicPlot/Graph5/education-num.html (deflated 89%)
  adding: content/DynamicPlot/Graph5/fnlwgt.html (deflated 87%)
  adding: content/DynamicPlot/Graph5/hours-per-week.html (deflated 90%)
  adding: content/DynamicPlot/Graph5/capital-loss.html (deflated 91%)
  adding: content/DynamicPlot/Graph5/age.html (deflated 89%)
  adding: content/DynamicPlot/Graph8/ (stored 0%)
  adding: content/DynamicPlot/Graph8/Graph8.html (deflated 61%)
  adding: content/DynamicPlot/Graph4/ (stored 0%)
  adding: content/DynamicPlot/Graph4/relationship.html (deflated 91%)
  adding: content/DynamicPlot/Graph4/salary.html (deflated 88%)
  adding: content/DynamicPlot/Gra

Model Building

In [32]:
## Imputing Null values using KNN imputer
#from sklearn.impute import KNNImputer
#imputer = KNNImputer(n_neighbors=2)
#After_imputation = imputer.fit_transform(Before_imputation)

In [33]:
#!pip install auto-sklearn

In [34]:
## replacing Null values with NAN

df["workclass"].fillna(df["workclass"].mode()[0], inplace=True)
df["occupation"].fillna(df["occupation"].mode()[0], inplace=True)
df["country"].fillna(df["country"].mode()[0], inplace=True)

In [35]:
X=df.drop("salary",axis=1)
one_hot_encoded_data = pd.get_dummies(X, columns = ['workclass', 'education','marital-status','occupation','relationship','race','sex','country'])
y=df["salary"]
y=y.replace(" >50K",1)
y=y.replace(" <=50K",0)

In [37]:
y=df["salary"]
y=y.replace(" >50K",1)
y=y.replace(" <=50K",0)

In [38]:
X_train,X_val,y_train,y_val = train_test_split(one_hot_encoded_data,y,test_size=0.3,random_state=42)

In [39]:
automl = classification.AutoSklearnClassifier(
    time_left_for_this_task=60*10,
    per_run_time_limit=2*60)

In [40]:
automl.fit(X_train, y_train)

AutoSklearnClassifier(per_run_time_limit=120, time_left_for_this_task=600)

In [41]:
# summarizing
print(automl.sprint_statistics())

auto-sklearn results:
  Dataset name: 4935282c-f478-11ec-80c9-0242ac1c0002
  Metric: accuracy
  Best validation score: 0.875698
  Number of target algorithm runs: 17
  Number of successful target algorithm runs: 11
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 3
  Number of target algorithms that exceeded the memory limit: 3



In [42]:
pd.DataFrame(automl.leaderboard(detailed = True, ensemble_only=False))

Unnamed: 0_level_0,rank,ensemble_weight,type,cost,duration,config_id,train_loss,seed,start_time,end_time,budget,status,data_preprocessors,feature_preprocessors,balancing_strategy,config_origin
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
5,1,0.14,gradient_boosting,0.124302,14.34935,4,0.092796,0,1656156000.0,1656156000.0,0.0,StatusType.SUCCESS,[],[select_rates_classification],none,Initial design
10,2,0.18,gradient_boosting,0.126562,18.072823,9,0.119581,0,1656156000.0,1656156000.0,0.0,StatusType.SUCCESS,[],[feature_agglomeration],none,Initial design
9,3,0.54,adaboost,0.130284,33.099041,8,0.121938,0,1656156000.0,1656156000.0,0.0,StatusType.SUCCESS,[],[feature_agglomeration],none,Initial design
15,4,0.08,adaboost,0.135336,13.47547,14,0.131041,0,1656156000.0,1656156000.0,0.0,StatusType.SUCCESS,[],[no_preprocessing],none,Initial design
2,5,0.02,random_forest,0.144376,23.625458,1,0.0,0,1656156000.0,1656156000.0,0.0,StatusType.SUCCESS,[],[no_preprocessing],none,Initial design
3,6,0.02,mlp,0.157405,77.990416,2,0.084807,0,1656156000.0,1656156000.0,0.0,StatusType.SUCCESS,[],[feature_agglomeration],weighting,Initial design
8,7,0.0,random_forest,0.194895,35.648554,7,0.149902,0,1656156000.0,1656156000.0,0.0,StatusType.SUCCESS,[],[pca],none,Initial design
6,8,0.02,gradient_boosting,0.205132,9.415527,5,0.035887,0,1656156000.0,1656156000.0,0.0,StatusType.SUCCESS,[],[feature_agglomeration],weighting,Initial design
16,9,0.0,random_forest,0.242489,5.759637,15,0.242502,0,1656156000.0,1656156000.0,0.0,StatusType.SUCCESS,[],[feature_agglomeration],none,Initial design
11,10,0.0,gradient_boosting,0.245015,20.392567,10,0.242109,0,1656156000.0,1656156000.0,0.0,StatusType.SUCCESS,[],[feature_agglomeration],none,Initial design


In [43]:
## For all the model details
from pprint import pprint
pprint(automl.show_models(),indent=4)

{   2: {   'balancing': Balancing(random_state=1),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7fe8251282d0>,
           'cost': 0.14437649561286892,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7fe8253b1490>,
           'ensemble_weight': 0.02,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7fe825215290>,
           'model_id': 2,
           'rank': 5,
           'sklearn_classifier': RandomForestClassifier(max_features=10, n_estimators=512, n_jobs=1,
                       random_state=1, warm_start=True)},
    3: {   'balancing': Balancing(random_state=1, strategy='weighting'),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7fe8250d6b90>,
           'cost': 0.15740494549321993,
           'data_preprocessor': <autosklea

In [44]:
## For Selected Model
pprint(automl.get_models_with_weights(),indent=4)

[   (   0.54,
        SimpleClassificationPipeline({'balancing:strategy': 'none', 'classifier:__choice__': 'adaboost', 'data_preprocessor:__choice__': 'feature_type', 'feature_preprocessor:__choice__': 'feature_agglomeration', 'classifier:adaboost:algorithm': 'SAMME', 'classifier:adaboost:learning_rate': 0.4034077156997028, 'classifier:adaboost:max_depth': 7, 'classifier:adaboost:n_estimators': 280, 'data_preprocessor:feature_type:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding', 'data_preprocessor:feature_type:categorical_transformer:category_coalescence:__choice__': 'no_coalescense', 'data_preprocessor:feature_type:numerical_transformer:imputation:strategy': 'most_frequent', 'data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__': 'quantile_transformer', 'feature_preprocessor:feature_agglomeration:affinity': 'cosine', 'feature_preprocessor:feature_agglomeration:linkage': 'average', 'feature_preprocessor:feature_agglomeration:n_clusters'

In [45]:
## Cross Validation results
pd.DataFrame(automl.cv_results_)

Unnamed: 0,mean_test_score,mean_fit_time,params,rank_test_scores,status,budgets,param_balancing:strategy,param_classifier:__choice__,param_data_preprocessor:__choice__,param_feature_preprocessor:__choice__,...,param_data_preprocessor:feature_type:numerical_transformer:rescaling:robust_scaler:q_max,param_data_preprocessor:feature_type:numerical_transformer:rescaling:robust_scaler:q_min,param_feature_preprocessor:fast_ica:n_components,param_feature_preprocessor:kernel_pca:coef0,param_feature_preprocessor:kernel_pca:degree,param_feature_preprocessor:kernel_pca:gamma,param_feature_preprocessor:nystroem_sampler:coef0,param_feature_preprocessor:nystroem_sampler:degree,param_feature_preprocessor:nystroem_sampler:gamma,param_feature_preprocessor:select_rates_classification:mode
0,0.855624,23.625458,"{'balancing:strategy': 'none', 'classifier:__c...",5,Success,0.0,none,random_forest,feature_type,no_preprocessing,...,,,,,,,,,,
1,0.842595,77.990416,"{'balancing:strategy': 'weighting', 'classifie...",6,Success,0.0,weighting,mlp,feature_type,feature_agglomeration,...,,,,,,,,,,
2,0.0,0.965316,"{'balancing:strategy': 'none', 'classifier:__c...",12,Memout,0.0,none,gradient_boosting,feature_type,polynomial,...,,,,,,,,,,
3,0.875698,14.34935,"{'balancing:strategy': 'none', 'classifier:__c...",1,Success,0.0,none,gradient_boosting,feature_type,select_rates_classification,...,,,,,,,,,,fpr
4,0.794868,9.415527,"{'balancing:strategy': 'weighting', 'classifie...",8,Success,0.0,weighting,gradient_boosting,feature_type,feature_agglomeration,...,,,,,,,,,,
5,0.0,3.017867,"{'balancing:strategy': 'none', 'classifier:__c...",12,Memout,0.0,none,random_forest,feature_type,polynomial,...,,,,,,,,,,
6,0.805105,35.648554,"{'balancing:strategy': 'none', 'classifier:__c...",7,Success,0.0,none,random_forest,feature_type,pca,...,,,,,,,,,,
7,0.869716,33.099041,"{'balancing:strategy': 'none', 'classifier:__c...",3,Success,0.0,none,adaboost,feature_type,feature_agglomeration,...,,,,,,,,,,
8,0.873438,18.072823,"{'balancing:strategy': 'none', 'classifier:__c...",2,Success,0.0,none,gradient_boosting,feature_type,feature_agglomeration,...,,,,,,,,,,
9,0.754985,20.392567,"{'balancing:strategy': 'none', 'classifier:__c...",10,Success,0.0,none,gradient_boosting,feature_type,feature_agglomeration,...,,,,,,,,,,


Prediction

In [46]:
train_pred = automl.predict(X_train)
test_pred = automl.predict(X_val)

In [49]:
from sklearn.metrics import f1_score

In [51]:
print("F1 Score for Validation Data",f1_score(y_val, test_pred))
print("F1 Score for Training Data",f1_score(y_train, train_pred))

F1 Score for Validation Data 0.7071797278273111
F1 Score for Training Data 0.7666797488226059


In [52]:
from sklearn.metrics import accuracy_score

In [53]:
print("Accuracy Score for Validation Data",accuracy_score(y_val, test_pred))
print("Accuracy Score for Validation Data",accuracy_score(y_train, train_pred))

Accuracy Score for Validation Data 0.8722489507626164
Accuracy Score for Validation Data 0.8956651456651457


## F1 Score (Validation Data) = 0.7071797278273111
## F1 Score (Training Data) = 0.7666797488226059

## Accuracy Score for Validation Data 0.8722489507626164
## Accuracy Score for Validation Data 0.8956651456651457