#### Basic Data Exploration

In [None]:
# Importing Dataset and Libraries
import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from scipy.stats import ttest_ind
from sklearn.metrics import accuracy_score
df = pd.read_csv('https://raw.githubusercontent.com/ASHOKKUMAR-K/Weight_Prediction/master/weight-height.csv',index_col=False)


In [None]:
#Checking the shape
df.shape

(10000, 3)

In [None]:
#Checking data set
df.head()

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801


In [None]:
df.tail()

Unnamed: 0,Gender,Height,Weight
9995,Female,66.172652,136.777454
9996,Female,67.067155,170.867906
9997,Female,63.867992,128.475319
9998,Female,69.034243,163.852461
9999,Female,61.944246,113.649103


In [None]:
#Checking for Null Values
df.isnull().values.sum()

0

#### Building Linear Model

In [None]:
#Assigning Dependent and Independent values
X=df[['Gender','Height']]
Y=df['Weight']

# Converting Gender values  into Numeric Value
X=df.replace({'Gender':{'Male':0 ,'Female':1}})


#### Q1.1 - Report the coefficients values by using the standard Least Square Estimates

In [None]:
# Fitting the Model
model= sm.OLS(Y,X).fit()
summary=model.summary()

#### #Q1.2 - What is the standard error of the estimated coefficients, R-squared term, and the 95% confidence interval?

In [None]:
#Standard Error of Estimated coefficients
standard_errors=model.bse
print(standard_errors)

Gender    2.617452e-15
Height    1.570643e-16
Weight    5.744886e-17
dtype: float64


In [None]:
# Calculating R_Squared Error for model
r_squared_term = model.rsquared
print(f'The R-Squared term for model is :{r_squared_term}')

The R-Squared term for model is :1.0


In [None]:
# Calculating 95% Confidence of model
CI=model.conf_int(alpha=0.05)
print(f"95% confidence interval of model is {CI} ")

95% confidence interval of model is                    0             1
Gender -1.401339e-13 -1.298724e-13
Height  1.357457e-15  1.973212e-15
Weight  1.000000e+00  1.000000e+00 


#### Q1.3 -Is there any dependence between the gender and weight of the subject?

In [None]:
# separate the weight data by gender
male_weights = df[df['Gender'] == 'Male']['Weight']
female_weights = df[df['Gender'] == 'Female']['Weight']

# perform the t-test
t_stat, p_value = ttest_ind(male_weights, female_weights, equal_var=False)

# print the results
print("t-statistic:", t_stat)
print("p-value:", p_value)


t-statistic: 131.8195276508186
p-value: 0.0


### Yes, there is a dependence between gender and weight of the subject, as indicated by the significant coefficient for the gender variable in the linear regression model. The coefficient suggests that, on average, males weigh more than females after controlling for height. Additionally, the p-value associated with the gender coefficient is less than the significance level of 0.05, indicating that the effect of gender on weight is statistically significant. Therefore, we can conclude that gender is a significant predictor of weight in this model. The t-statistic is very high and the p-value is close to zero, indicating that there is a significant difference in weight between males and females in the dataset.

####Q1.4 -  Explain how are the qualitative attributes (such as gender) included in the regression model?

Qualitative attributes, also known as categorical variables, are included in a regression model through the use of dummy variables. A dummy variable is a binary variable that takes on the value of 1 if the categorical variable is present, and 0 otherwise.

In the case of the gender variable, we can create a dummy variable that takes on the value of 1 for males and 0 for females, or vice versa. This dummy variable is then included in the regression model as an independent variable alongside the other continuous variables.

By including the dummy variable in the regression model, we are essentially estimating separate regression lines for each category of the categorical variable. For example, in our weight prediction model, including a gender dummy variable allows us to estimate separate regression lines for males and females.

When interpreting the coefficients of the regression model, we can interpret the coefficient associated with the dummy variable as the difference in the mean response variable between the two categories, all other variables being equal. In our weight prediction model, the gender coefficient represents the difference in mean weight between males and females, with height being held constant.

It is important to note that when including dummy variables in a regression model, we need to leave out one category as a reference category. This is because including all categories as separate dummy variables will result in perfect multicollinearity, which can cause issues with estimation and interpretation of coefficients.

## Q2 Using the data source in Q1 fit the Ridge and Lasso Regression Models.

In [None]:
#Splitting data into Train Test using
X_train,X_test,Y_train,Y_test= train_test_split(X,Y,test_size=0.2)

In [None]:
# Fit the Lasso regression model
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train, Y_train)

Lasso(alpha=0.1)

In [None]:
#Fitting Lasso Regression Model
ridge_model= Ridge(alpha=0.1)
ridge_model.fit(X_train, Y_train)

Ridge(alpha=0.1)

#### Q2.1 - Report the coefficients for both the models

In [None]:
# Report the coefficients for Ridge and Lasso regression models
ridge_coefs = pd.DataFrame(data=ridge_model.coef_, index=X_train.columns, columns=['Ridge Coefficients'])
lasso_coefs = pd.DataFrame(data=lasso_model.coef_, index=X_train.columns, columns=['Lasso Coefficients'])
print("Ridge regression model coefficients: ")
print(ridge_coefs)
print("\nLasso regression model coefficients: ")
print(lasso_coefs)


Ridge regression model coefficients: 
        Ridge Coefficients
Gender       -2.436045e-06
Height        7.478465e-07
Weight        9.999999e-01

Lasso regression model coefficients: 
        Lasso Coefficients
Gender           -0.000000
Height            0.000000
Weight            0.999903


#### Q.2.2 Report the attribute (among height and gender) least impacting the weight of the subject.

In [None]:
# Extracting P values to check less impacting subject.

In [None]:
# Extract the p-values
p_values = model.summary2().tables[1]['P>|t|']
# Print the p-values
print(f"P-values:\n{p_values}")

P-values:
Gender    0.000000e+00
Height    3.976155e-26
Weight    0.000000e+00
Name: P>|t|, dtype: float64


#### As we can see Height has higher P value which means it is less impactful.

## Q3 Using the data source in Q1, fit Logistic Regression model to predict gender of the subject based on height and weight of the subject. Please use 90% of the data for training and rest 10% for evaluation using predict method

In [None]:
# Split data into train and test sets using 90% of Data for Training and 10 % for Testing.
X1_train,X1_test,Y1_train,Y1_test= train_test_split(X,Y,test_size=0.1)

# Fitting the logistic regression model
# Fitting the Model
reg_model= sm.OLS(Y,X).fit()
# Predict on test set
y_pred = reg_model.predict(X_test)
