In [3]:
import time
import warnings

import seaborn as sns

# Silence all warnings from this point on.
warnings.filterwarnings("ignore")

# Tip: sns.get_dataset_names() lists every dataset seaborn can load.

# Load seaborn's "tips" dataset and keep only the two columns used
# below: total_bill and tip.
df = sns.load_dataset("tips")[['total_bill', 'tip']]
df.head()

Unnamed: 0,total_bill,tip
0,16.99,1.01
1,10.34,1.66
2,21.01,3.5
3,23.68,3.31
4,24.59,3.61


In [4]:
# Simple linear regression on the tips data:
#   X -> total_bill (independent variable)
#   y -> tip        (dependent variable)

import statsmodels.api as sm
from sklearn.model_selection import train_test_split

#-------------------------------------------------------
# Design matrix: total_bill plus a constant (intercept) column.
# X.head(5) would show:
#
#      const  total_bill
# 0      1.0       16.99
# 1      1.0       10.34
# 2      1.0       21.01
# 3      1.0       23.68
# 4      1.0       24.59
#-------------------------------------------------------
X = sm.add_constant(df['total_bill'])
Y = df['tip']

#=================================================================
# Split into training and validation sets: 80% of the rows go to
# training, with a fixed random_state so the split is repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    Y,
    train_size=0.8,
    random_state=100,
)

# Report how many rows/columns landed in each part of the split.
print("Shape of train_X = ", train_X.shape)
print("Shape of test_X = ", test_X.shape)

#=================================================================
# Fit ordinary least squares on the training portion only.
df_fitted_model = sm.OLS(train_y, train_X).fit()

#=================================================================
# Estimated parameters: the intercept (const) and the total_bill slope.
print(df_fitted_model.params)

#=================================================================

Shape of train_X =  (195, 2)
Shape of test_X =  (49, 2)
const         1.034947
total_bill    0.096732
dtype: float64


In [5]:
# Display the regression summary report for the OLS model fitted on the
# training split (fit statistics, coefficient table and diagnostics).
df_fitted_model.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.396
Dependent Variable:,tip,AIC:,558.2813
Date:,2020-11-20 20:05,BIC:,564.8273
No. Observations:,195,Log-Likelihood:,-277.14
Df Model:,1,F-statistic:,128.1
Df Residuals:,193,Prob (F-statistic):,4.1699999999999994e-23
R-squared:,0.399,Scale:,1.015

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
const,1.0349,0.1806,5.7314,0.0000,0.6788,1.3911
total_bill,0.0967,0.0085,11.3182,0.0000,0.0799,0.1136

0,1,2,3
Omnibus:,24.579,Durbin-Watson:,2.035
Prob(Omnibus):,0.0,Jarque-Bera (JB):,49.068
Skew:,0.608,Prob(JB):,0.0
Kurtosis:,5.136,Condition No.:,53.0


# Example: R-squared equal to 1

### Create a dataset in which X and Y are identical

In [7]:
# Build a toy dataset whose two columns are identical, so that a
# regression of Y on X fits the data perfectly.
import random

# Imported here so this cell does not depend on an earlier cell having
# brought pandas into the namespace.
import pandas as pd

# Seed the generator so that Restart & Run All reproduces the same sample.
# NOTE: outputs saved from an earlier, unseeded run will show other values.
random.seed(100)

# 100 random integers between 1 and 1000 (randint includes both endpoints).
X = [random.randint(1, 1000) for _ in range(100)]

# Both columns are built from the same list, so Y equals X on every row.
df = pd.DataFrame({"X": X,
                   "Y": X})
df.head()

Unnamed: 0,X,Y
0,992,992
1,299,299
2,238,238
3,66,66
4,527,527


In [8]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

# Regressors: column X plus a constant (intercept) column.
# Response: column Y, which was built from the same values as X.
X = sm.add_constant(df['X'])
Y = df['Y']

# 80% of the rows go to training; the fixed random_state keeps the split
# repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    Y,
    train_size=0.8,
    random_state=100,
)

# Fit ordinary least squares on the training rows and show the summary.
df_fitted_model = sm.OLS(train_y, train_X).fit()

df_fitted_model.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,1.0
Dependent Variable:,Y,AIC:,-4498.3294
Date:,2020-11-20 20:11,BIC:,-4493.5654
No. Observations:,80,Log-Likelihood:,2251.2
Df Model:,1,F-statistic:,3.0769999999999998e+32
Df Residuals:,78,Prob (F-statistic):,0.0
R-squared:,1.000,Scale:,2.1718e-26

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
const,-0.0000,0.0000,-5.0248,0.0000,-0.0000,-0.0000
X,1.0000,0.0000,17541405122860152.0000,0.0000,1.0000,1.0000

0,1,2,3
Omnibus:,0.037,Durbin-Watson:,0.072
Prob(Omnibus):,0.982,Jarque-Bera (JB):,13.316
Skew:,-0.049,Prob(JB):,0.001
Kurtosis:,1.004,Condition No.:,1188.0
