In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
%matplotlib inline
CreditData = pd.read_csv("Credit.csv")
CreditData.head(10)

Unnamed: 0.1,Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
0,1,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian,333
1,2,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian,903
2,3,104.593,7075,514,4,71,11,Male,No,No,Asian,580
3,4,148.924,9504,681,3,36,11,Female,No,No,Asian,964
4,5,55.882,4897,357,2,68,16,Male,No,Yes,Caucasian,331
5,6,80.18,8047,569,4,77,10,Male,No,No,Caucasian,1151
6,7,20.996,3388,259,2,37,12,Female,No,No,African American,203
7,8,71.408,7114,512,2,87,9,Male,No,No,Asian,872
8,9,15.125,3300,266,5,66,13,Female,No,No,Caucasian,279
9,10,71.061,6819,491,3,41,19,Female,Yes,Yes,African American,1350


In [2]:
del CreditData['Unnamed: 0']

#### Let's look at correlation matrix. This time, we only explore the quantitative variables that affect Credit Balance. From your preliminary analysis, which 3 variables seem to affect Balance the most? If our goal is interpretation; should we use these 3 variables simultaneously? Why?

In [3]:
CreditData.corr()

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Balance
Income,1.0,0.792088,0.791378,-0.018273,0.175338,-0.027692,0.463656
Limit,0.792088,1.0,0.99688,0.010231,0.100888,-0.023549,0.861697
Rating,0.791378,0.99688,1.0,0.053239,0.103165,-0.030136,0.863625
Cards,-0.018273,0.010231,0.053239,1.0,0.042948,-0.051084,0.086456
Age,0.175338,0.100888,0.103165,0.042948,1.0,0.003619,0.001835
Education,-0.027692,-0.023549,-0.030136,-0.051084,0.003619,1.0,-0.008062
Balance,0.463656,0.861697,0.863625,0.086456,0.001835,-0.008062,1.0


Answer: Income, Limit and Rating seem to affect Balance most. No. We need to test if these three variable are associated with each other.

#### There are few categorical variables, let's first create dummy variables for them


In [4]:
RaceDummy = pd.get_dummies(CreditData.Ethnicity, prefix = 'Race')
del RaceDummy['Race_African American']

# I did one for you. Complete the rest
GenderDummy = pd.get_dummies(CreditData.Gender, prefix = 'Gender')
del GenderDummy['Gender_ Male']
MarriedDummy = pd.get_dummies(CreditData.Married, prefix = 'Married')
del MarriedDummy['Married_No']
StudentDummy = pd.get_dummies(CreditData.Student, prefix = 'Student')
del StudentDummy['Student_No']
# Concatenate dummy variables with old data
CreditData = pd.concat([CreditData, RaceDummy,GenderDummy,MarriedDummy,StudentDummy], axis=1)

CreditData.head()
# If you have done it correctly, there should be a total of 16 columns

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance,Race_Asian,Race_Caucasian,Gender_Female,Married_Yes,Student_Yes
0,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian,333,0,1,0,1,0
1,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian,903,1,0,1,1,1
2,104.593,7075,514,4,71,11,Male,No,No,Asian,580,1,0,0,0,0
3,148.924,9504,681,3,36,11,Female,No,No,Asian,964,1,0,1,0,0
4,55.882,4897,357,2,68,16,Male,No,Yes,Caucasian,331,0,1,0,1,0


# Now it's time for some fun!

#### By a regression line, use Education, Ethnicity, Gender, Age, Cards, and Income to predict Balance. 

First Step, find the coefficients of your regression line (print them out)

In [5]:
# Use sklearn
#input the library
from sklearn.linear_model import LinearRegression 
linreg = LinearRegression()
X = CreditData[['Education','Race_Asian','Race_Caucasian','Gender_Female','Age','Cards','Income']]
y = CreditData['Balance']
linreg.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [6]:
print(linreg.intercept_)
print(linreg.coef_)

230.04235439275942
[ 1.64553607 -6.54603078  3.47497641 27.12543123 -2.32970547 33.62953508
  6.27995894]


Second Step, find the p-values of your estimates. You have a few variables; try to show your p-values along side the names of the variables.

In [7]:
# Use statsmodels, refit the regression model and print the p-values
X_con = sm.add_constant(X)
model = sm.OLS(y, X_con).fit()
#model.summary()
print(model.summary())
print("P-Vales: ", list(zip(['Education','Race_Asian','Race_Caucasian','Gender_Female','Age','Cards','Income'], 
                      model.pvalues[1:8])))

                            OLS Regression Results                            
Dep. Variable:                Balance   R-squared:                       0.232
Model:                            OLS   Adj. R-squared:                  0.219
Method:                 Least Squares   F-statistic:                     16.95
Date:                Mon, 02 Mar 2020   Prob (F-statistic):           1.41e-19
Time:                        19:55:00   Log-Likelihood:                -2966.5
No. Observations:                 400   AIC:                             5949.
Df Residuals:                     392   BIC:                             5981.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const            230.0424    130.247      1.

  return ptp(axis=axis, out=out, **kwargs)


#### Which of your coefficients are significant at significance level 5%?

Answer: Cards and Income are significant.

#### What is the R-Squared of your model?

In [8]:
# R Squared
linreg.score(X,y)

0.23231260833540443

#### How do we interpret this value?

Answer: The model can't significantly reflex the data. 

#### Now focus on two of the most significant variables from your previous model and re-run your regression model. What are the coefficients and the p-values?

In [9]:
X = CreditData[['Cards','Income']]
#import formula
import statsmodels.formula.api as smf
lm = smf.ols(formula='y ~ X', data=CreditData).fit() #Notice here we don't need to add the intercept term ourselves.
lm.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.224
Model:,OLS,Adj. R-squared:,0.22
Method:,Least Squares,F-statistic:,57.3
Date:,"Mon, 02 Mar 2020",Prob (F-statistic):,1.3800000000000001e-22
Time:,19:55:00,Log-Likelihood:,-2968.6
No. Observations:,400,AIC:,5943.0
Df Residuals:,397,BIC:,5955.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,151.3299,55.289,2.737,0.006,42.634,260.026
X[0],31.8381,14.826,2.147,0.032,2.691,60.985
X[1],6.0710,0.577,10.525,0.000,4.937,7.205

0,1,2,3
Omnibus:,39.698,Durbin-Watson:,1.975
Prob(Omnibus):,0.0,Jarque-Bera (JB):,19.444
Skew:,0.359,Prob(JB):,5.99e-05
Kurtosis:,2.194,Cond. No.,160.0


#### In comparison to the previous model, did our R-Squared increase or decrease? Why do you think that happened?

In [10]:
# R Squared
lm.rsquared

0.22399175162249518

Answer: R-Square decrease, because we have less input variables this time, which means the data we use can't help us predict the model.

#### Now let's regress Balance on Gender alone. After running your regression lines, do you have enough evidence to claim that females having more balance than males? (Hint: Look at the p-value of the Gender coefficient. If it is significant then you will have evidence to support that claim, otherwise you cannot support the statement).

In [11]:
X = GenderDummy
lm = smf.ols(formula='y ~ X', data=CreditData).fit() 
lm.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.002
Method:,Least Squares,F-statistic:,0.1836
Date:,"Mon, 02 Mar 2020",Prob (F-statistic):,0.669
Time:,19:55:00,Log-Likelihood:,-3019.3
No. Observations:,400,AIC:,6043.0
Df Residuals:,398,BIC:,6051.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,509.8031,33.128,15.389,0.000,444.675,574.931
X,19.7331,46.051,0.429,0.669,-70.801,110.267

0,1,2,3
Omnibus:,28.438,Durbin-Watson:,1.94
Prob(Omnibus):,0.0,Jarque-Bera (JB):,27.346
Skew:,0.583,Prob(JB):,1.15e-06
Kurtosis:,2.471,Cond. No.,2.66


Answer: (If your answer is yes, interpret the results)

#### Now let's regress Balance on Ethnicity. After running your regression lines, do you have enough evidence to claim that some ethnic groups carry more balance than others? (Hint: Look at the p-value of  your dummy variables. If it is significant then you will have evidence to support that claim, otherwise you cannot support that statement).

In [12]:
X = RaceDummy
lm = smf.ols(formula='y ~ X', data=CreditData).fit() 
lm.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.005
Method:,Least Squares,F-statistic:,0.04344
Date:,"Mon, 02 Mar 2020",Prob (F-statistic):,0.957
Time:,19:55:00,Log-Likelihood:,-3019.3
No. Observations:,400,AIC:,6045.0
Df Residuals:,397,BIC:,6057.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,531.0000,46.319,11.464,0.000,439.939,622.061
X[0],-18.6863,65.021,-0.287,0.774,-146.515,109.142
X[1],-12.5025,56.681,-0.221,0.826,-123.935,98.930

0,1,2,3
Omnibus:,28.829,Durbin-Watson:,1.946
Prob(Omnibus):,0.0,Jarque-Bera (JB):,27.395
Skew:,0.581,Prob(JB):,1.13e-06
Kurtosis:,2.46,Cond. No.,4.39


Answer: (If your answer is yes, interpret the results)

#### I know you get tired of this but for the last time regress Balance on Studentship status. After running your regression lines, do you have enough evidence to claim that students  carry more balance than others? (Hint: Look at the p-value of the your dummy variables. If it is significant then you will have evidence to support that claim, otherwise you cannot support the statement).


In [13]:
X = StudentDummy
lm = smf.ols(formula='y ~ X', data=CreditData).fit() 
lm.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.067
Model:,OLS,Adj. R-squared:,0.065
Method:,Least Squares,F-statistic:,28.62
Date:,"Mon, 02 Mar 2020",Prob (F-statistic):,1.49e-07
Time:,19:55:00,Log-Likelihood:,-3005.5
No. Observations:,400,AIC:,6015.0
Df Residuals:,398,BIC:,6023.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,480.3694,23.434,20.499,0.000,434.300,526.439
X,396.4556,74.104,5.350,0.000,250.771,542.140

0,1,2,3
Omnibus:,20.866,Durbin-Watson:,1.95
Prob(Omnibus):,0.0,Jarque-Bera (JB):,21.92
Skew:,0.544,Prob(JB):,1.74e-05
Kurtosis:,2.637,Cond. No.,3.37


Answer: (If your answer is yes, interpret the results). 
Yes. There is a linear relationship between Balance and Studentship Status. (0 Yes 1 No) If the person is not a student, it has a negative influence on the credit balance.

#### Now let's consider effect of students and income on balance simultaneously. Let's start with a regression line.

In [15]:
X =  CreditData[['Student_Yes','Income']]
lm = smf.ols(formula='y ~ X', data=CreditData).fit() 
lm.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.277
Model:,OLS,Adj. R-squared:,0.274
Method:,Least Squares,F-statistic:,76.22
Date:,"Mon, 02 Mar 2020",Prob (F-statistic):,9.640000000000001e-29
Time:,19:57:54,Log-Likelihood:,-2954.4
No. Observations:,400,AIC:,5915.0
Df Residuals:,397,BIC:,5927.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,211.1430,32.457,6.505,0.000,147.333,274.952
X[0],382.6705,65.311,5.859,0.000,254.272,511.069
X[1],5.9843,0.557,10.751,0.000,4.890,7.079

0,1,2,3
Omnibus:,119.719,Durbin-Watson:,1.951
Prob(Omnibus):,0.0,Jarque-Bera (JB):,23.617
Skew:,0.252,Prob(JB):,7.44e-06
Kurtosis:,1.922,Cond. No.,192.0


#### Are all of our regression coefficients significant? If yes, interpret them.

Answer: Yes, both coefficients are significant. We find that fixing income, students on average tend to carry 382 dollars more balance. Also, on average higher income earners tend to carry more balance on their credit cards. For every 1000 dollars additional income, people on average carry around 6 dollars more balance.

#### Now let's explore interaction between income and studentship. Let's start with a regression line

In [17]:
# First generate a column for interation term
CreditData['Income_Student'] = CreditData['Income'] * CreditData['Student_Yes']
CreditData.head(2)

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance,Race_Asian,Race_Caucasian,Gender_Female,Married_Yes,Student_Yes,Income_Student
0,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian,333,0,1,0,1,0,0.0
1,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian,903,1,0,1,1,1,106.025


In [18]:
# Now fit the regression model
X =  CreditData[['Student_Yes','Income','Income_Student']]
lm = smf.ols(formula='y ~ X', data=CreditData).fit() 
lm.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.28
Model:,OLS,Adj. R-squared:,0.274
Method:,Least Squares,F-statistic:,51.3
Date:,"Mon, 02 Mar 2020",Prob (F-statistic):,4.94e-28
Time:,19:59:01,Log-Likelihood:,-2953.7
No. Observations:,400,AIC:,5915.0
Df Residuals:,396,BIC:,5931.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,200.6232,33.698,5.953,0.000,134.373,266.873
X[0],476.6758,104.351,4.568,0.000,271.524,681.827
X[1],6.2182,0.592,10.502,0.000,5.054,7.382
X[2],-1.9992,1.731,-1.155,0.249,-5.403,1.404

0,1,2,3
Omnibus:,107.788,Durbin-Watson:,1.952
Prob(Omnibus):,0.0,Jarque-Bera (JB):,22.158
Skew:,0.228,Prob(JB):,1.54e-05
Kurtosis:,1.941,Cond. No.,309.0


#### Are our coefficients signifincant? Write down your regression line below:

Answer: No. 

Balance_Hat = 200.62 + 476.67 * Student_Dummy + 6.21 * Income - 1.999 * Income * Student_Dummy

#### Assume all coefficients in above regression were significant. Is there any income level at which students and non-students on average carry same level of balance (you will need to do some math here)?

Answer: When Income = 476.6758 / 1.9992 = 238.4333


  
Theoretically, at 238.433 income, these two groups carry the same amount of balance. Since this range of income is higher than observed values for students, it is safe to say that within the range of our observations, students on average carry more balance. 

We interpret the results this way. We say, students on average carry 476.67 dollars more than non-students. But for every 1000 dollars they make this difference between the balance that students and non-students are carrying is decreased by around 2 dollars. 