In [1]:
import pandas as pd
import numpy as np

In [2]:
def stats_description(df, col1, col2):
    '''
    Summarise a numeric column for every group of a categorical column.

    Input:
    df - dataframe holding both columns (described by the original notes as the
         preprocessed Seattle listings dataframe)
    col1 - name of the categorical column to group by
    col2 - name of the numeric column to summarise

    Output:
    sats_summary - dataframe with one row per group of col1 and seven float
                   columns, in this order: 'mean_', 'median_', 'std_', 'max_',
                   'min_', '5th perc_' and '95th perc_', each followed by col2.

    Steps:
    1. Group col2 by col1.
    2. Compute every statistic on the grouped column.
    3. Assemble the per-group results into a single dataframe.

    Note: counts and percentage shares of col1 are not part of this table;
    cat_counts(df, col1) returns them.
    '''
    # Build the grouped view once instead of re-grouping for every statistic.
    grouped = df.groupby(col1)[col2]

    sats_summary = pd.DataFrame({
        'mean_' + col2: grouped.mean(),
        'median_' + col2: grouped.median(),
        'std_' + col2: grouped.std(),
        'max_' + col2: grouped.max(),
        'min_' + col2: grouped.min(),
        # The 5th percentile is quantile(.05); quantile(.5) only repeats the median.
        '5th perc_' + col2: grouped.quantile(.05),
        '95th perc_' + col2: grouped.quantile(.95),
    })

    # Keep the table all-float with an unnamed index, as callers received before.
    sats_summary = sats_summary.astype(float).rename_axis(None)

    return sats_summary

In [4]:
def lm_mod(X, y,test_size = .30, random_state=42, plot=True):
    
    '''
    Fit a linear regression on a train/test split and report how well it fits.

    INPUT
    X - pandas dataframe, X matrix
    y - pandas series or single-column dataframe, response variable
    test_size - float between 0 and 1, default 0.3, determines the proportion of data as test data
    random_state - int, default 42, controls random state for train_test_split
    plot - boolean, default True, True to draw the diagnostic plots
    
    OUTPUT
    lm_model - fitted model object from sklearn (the only value returned)

    SIDE EFFECTS
    Prints - r2 and mse scores on the train and test data
    Plots - Predictions & Actual and Predictions & Differences (only when plot is True)
    Prints - features' coefficients arranged in descending order of the absolute coefficients
    '''
    # NOTE(review): train_test_split, LinearRegression, r2_score, mean_squared_error
    # and plt are not imported in the visible cells - confirm an import cell provides them.

    # Split the data into train and test, using the caller's settings
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    
    # Fit the model and obtain pred response
    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2 -
    # confirm the installed version still accepts it.
    lm_model = LinearRegression(normalize=True) 
    lm_model.fit(X_train, y_train)
    y_test_preds = lm_model.predict(X_test)
    y_train_preds = lm_model.predict(X_train)
    
    # r2 and mse values from the train and test sets
    r2_scores_train = r2_score(y_train, y_train_preds)
    print ("r2_scores_train = {}".format(r2_scores_train))
    r2_scores_test = r2_score(y_test, y_test_preds)
    print ("r2_scores_test = {}".format(r2_scores_test))
    
    mse_score_train = mean_squared_error(y_train, y_train_preds)
    print ("mse_score_train = {}".format(mse_score_train))
    mse_score_test = mean_squared_error(y_test, y_test_preds)
    print ("mse_score_test = {}".format(mse_score_test))
    
    if plot:
        # Column vectors so actual and predicted values line up side by side
        # whether y is a series or a single-column dataframe.
        preds_vs_act = pd.DataFrame(
            np.hstack([y_test.values.reshape(y_test.size, 1),
                       y_test_preds.reshape(y_test.size, 1)]),
            columns=['actual', 'preds'])
        preds_vs_act['diff'] = preds_vs_act['actual'] - preds_vs_act['preds']

        plt.plot(preds_vs_act['preds'], preds_vs_act['actual'], 'bo')
        plt.xlabel('predicted')
        plt.ylabel('Actual')
        plt.title('Predictions & Actual')
        plt.show()

        plt.plot(preds_vs_act['preds'], preds_vs_act['diff'], 'ro')
        plt.xlabel('predicted')
        plt.ylabel('diff')
        plt.title('Predictions & Difference')
        plt.show()
    
    # coef_ has shape (1, n_features) when y is a single-column dataframe;
    # flatten it so there is one coefficient per feature row.
    coefs = np.ravel(lm_model.coef_)
    coefs_df = pd.DataFrame({
        'est_int': X_train.columns,
        'coefs': coefs,
        'abs_coefs': np.abs(coefs),
    })
    print(coefs_df.sort_values('abs_coefs', ascending=False))
       
    return lm_model

In [5]:
def cat_counts(df,col):
    '''
    Count how often each value of a column occurs and its share of the total.

    Input:
    df - dataframe containing the column
    col - name of the column to count

    Output:
    dff - dataframe with one row per distinct value, in the order returned by
          value_counts(), and three columns: col (the value), 'Count' (number of
          occurrences) and 'Perc' (Count divided by the sum of all counts, i.e. a
          fraction rather than a 0-100 percentage).
    '''
    # Compute the counts once; each (value, count) pair becomes one row.
    counts = df[col].value_counts()
    dff = pd.DataFrame(list(counts.items()), columns=[col, 'Count'])
    dff['Perc'] = dff['Count'] / dff['Count'].sum()

    return dff