#ASSIGNMENT - A3(Part C)
# **Performing Tobit regression analysis on NSSO68 Dataset**

* **AUTHOR**     : Rakshitha Vignesh Sargurunathan               

* **VID**        : V01109007

* **CREATED ON** : 07/01/2024

This project involves the analysis of a dataset using a Tobit regression model. The Tobit model
is particularly suitable for datasets with censored data, where the dependent variable is limited
or censored at a certain value. The objective is to understand the relationship between
household expenditure and various socio-economic factors.

In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats import norm
from statsmodels.base.model import GenericLikelihoodModel
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the dataset
df = pd.read_csv("/content/drive/MyDrive/SCMA/A1a/NSSO68.csv")
df.head()

Unnamed: 0,slno,grp,Round_Centre,FSU_number,Round,Schedule_Number,Sample,Sector,state,State_Region,...,pickle_v,sauce_jam_v,Othrprocessed_v,Beveragestotal_v,foodtotal_v,foodtotal_q,state_1,Region,fruits_df_tt_v,fv_tot
0,1,4.099999999999999e+31,1,41000,68,10,1,2,24,242,...,0.0,0.0,0.0,0.0,1141.4924,30.942394,GUJ,2,12.0,154.18
1,2,4.099999999999999e+31,1,41000,68,10,1,2,24,242,...,0.0,0.0,0.0,17.5,1244.5535,29.286153,GUJ,2,333.0,484.95
2,3,4.099999999999999e+31,1,41000,68,10,1,2,24,242,...,0.0,0.0,0.0,0.0,1050.3154,31.527046,GUJ,2,35.0,214.84
3,4,4.099999999999999e+31,1,41000,68,10,1,2,24,242,...,0.0,0.0,0.0,33.333333,1142.591667,27.834607,GUJ,2,168.333333,302.3
4,5,4.099999999999999e+31,1,41000,68,10,1,2,24,242,...,0.0,0.0,0.0,75.0,945.2495,27.600713,GUJ,2,15.0,148.0


In [4]:
# Data preprocessing
# Check for missing values
print(df.isnull().sum())

slno              0
grp               0
Round_Centre      0
FSU_number        0
Round             0
                 ..
foodtotal_q       0
state_1           0
Region            0
fruits_df_tt_v    0
fv_tot            0
Length: 384, dtype: int64


In [5]:
# Choosing dependent and independent variables
dependent_var = 'MPCE_URP'
independent_vars = ['hhdsz', 'Age', 'Sex', 'Education']

In [6]:
# Prepare the data for regression
X = df[independent_vars]
y = df[dependent_var]

In [8]:
X = X.dropna()
y = y.loc[X.index]

# Add a constant to the independent variables matrix
X = sm.add_constant(X)

In [9]:
# Check for and remove any inf values
X.replace([np.inf, -np.inf], np.nan, inplace=True)
X.dropna(inplace=True)

# Check and ensure no NaN values remain
if X.isnull().any().any():
    raise ValueError("There are still NaN values in the independent variables matrix.")

# Check for and remove any inf values
y.replace([np.inf, -np.inf], np.nan, inplace=True)
y.dropna(inplace=True)

# Check and ensure no NaN values remain
if y.isnull().any().any():
    raise ValueError("There are still NaN values in the dependent variable.")

# Ensure dependent variable matches the cleaned X
y = y.loc[X.index]

In [10]:
# Define Tobit model
class TobitModel(GenericLikelihoodModel):
    def __init__(self, endog, exog, left=None, right=None, **kwargs):
        self.left = left
        self.right = right
        super(TobitModel, self).__init__(endog, exog, **kwargs)

    def nloglikeobs(self, params):
        exog = self.exog
        endog = self.endog
        left = self.left
        right = self.right

        beta = params[:-1]
        sigma = params[-1]
        XB = np.dot(exog, beta)

        ll = np.zeros(len(endog))

        if left is not None:
            cdf_left = (endog <= left).astype(int)
            ll += cdf_left * np.log(1 - norm.cdf((left - XB) / sigma))

        if right is not None:
            cdf_right = (endog >= right).astype(int)
            ll += cdf_right * np.log(norm.cdf((right - XB) / sigma))

        uncensored = np.ones(len(endog), dtype=bool)
        if left is not None:
            uncensored &= (endog > left)
        if right is not None:
            uncensored &= (endog < right)

        ll[uncensored] = (norm.logpdf((endog[uncensored] - XB[uncensored]) / sigma) - np.log(sigma))

        return -ll

    def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwargs):
        if start_params is None:
            start_params = np.append(np.zeros(self.exog.shape[1]), 1)
        return super(TobitModel, self).fit(start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwargs)

In [11]:
# Set left censoring at 0 (lower bound)
left_censoring = 0

# Fit the Tobit model
model = TobitModel(y, X, left=left_censoring)
results = model.fit()

Optimization terminated successfully.
         Current function value: 9.750560
         Iterations: 716
         Function evaluations: 1151


In [12]:
# Print the summary of the regression results
print(results.summary())

                              TobitModel Results                              
Dep. Variable:               MPCE_URP   Log-Likelihood:            -9.9119e+05
Model:                     TobitModel   AIC:                         1.982e+06
Method:            Maximum Likelihood   BIC:                         1.982e+06
Date:                Mon, 01 Jul 2024                                         
Time:                        11:26:13                                         
No. Observations:              101655                                         
Df Residuals:                  101650                                         
Df Model:                           4                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -19.3490     74.697     -0.259      0.796    -165.752     127.055
hhdsz         25.5786      6.143      4.164      0.0