# Question 3

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler


# Read the diabetes workbook; the path is relative to the notebook's working directory.
diabetes_data = pd.read_excel("Diabetes_Data.xlsx")

# Keep every column except the last one as the explanatory variables
# (presumably the last column is the response `Y` -- verify against the workbook).
explanatory_variables = diabetes_data.iloc[:, :-1]

# Pairwise Pearson correlations between the explanatory variables
# (Pearson is the pandas default; spelled out here for the reader).
correlation_matrix = explanatory_variables.corr(method="pearson")

# Show the correlation matrix as plain text
print(correlation_matrix)

In [None]:
# Draw the correlation matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    correlation_matrix,
    annot=True,         # write the coefficient inside every cell
    cmap='coolwarm',    # blue = negative, red = positive
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Heatmap of Explanatory Variables")
plt.show()

# Relationships between Variables

The heatmap visualizes the pairwise correlations between the explanatory variables. Positive correlations appear in warmer colors (red), while negative correlations appear in cooler colors (blue). A correlation close to 1 or -1 indicates a strong linear relationship between two variables, while a correlation near 0 indicates a weak or no linear relationship. Reading the heatmap therefore shows which explanatory variables are strongly correlated with each other.

In [None]:
# Fit the full linear model (all explanatory variables) with scikit-learn.
X = explanatory_variables
y = diabetes_data['Y']
model1 = LinearRegression().fit(X, y)
y_pred = model1.predict(X)

# In-sample mean squared error of the scikit-learn fit
mse = mean_squared_error(y, y_pred)

# Refit with statsmodels to obtain adjusted R-squared and per-coefficient
# significance. The design matrix with the added intercept column is kept under
# its own name: overwriting `X` would leak the `const` column into later cells
# that run feature selection on `X`, letting a constant be picked as a "feature".
X_const = sm.add_constant(X)
model = sm.OLS(y, X_const).fit()
adj_r2 = model.rsquared_adj

# Report fit quality; the summary table holds the p-value of every variable
print("Adjusted R2: ", adj_r2)
print("\nMean Squared Error: ", mse)
print()
print(model.summary())

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector
# Forward sequential feature selection wrapped around a plain linear regression.
# `k_features='best'` asks the selector to keep the subset size with the best
# score, where the score is the negative mean squared error.
linear = LinearRegression()
selection_feature = SequentialFeatureSelector(
    linear,
    k_features='best',
    forward=True,
    scoring='neg_mean_squared_error',
)

# Run the search on the current feature matrix and response
selection_feature.fit(X, y)

# Display the fitted selector as the cell output
selection_feature

In [None]:
# Translate the selected column positions into column names of X.
selected_positions = list(selection_feature.k_feature_idx_)
significant_feature = list(X.columns[selected_positions])
print(significant_feature)

# Build model2 with only the significant predictors

In [None]:
# Refit the linear model using only the features chosen by the sequential
# selector. The columns are taken from `significant_feature` (the names produced
# by the selection step) instead of a hard-coded positional slice, so this cell
# always agrees with whatever the selector actually picked.
# Entries that are not real data columns (e.g. a statsmodels `const` column that
# was part of the matrix the selector saw) are skipped; the intercept is handled
# by LinearRegression itself and by `sm.add_constant` below.
selected_columns = [col for col in significant_feature if col in diabetes_data.columns]
if not selected_columns:
    raise ValueError("Feature selection produced no usable columns for model2.")

X = diabetes_data[selected_columns]
y = diabetes_data['Y']

# scikit-learn fit, used for the in-sample predictions and MSE
model2_sklearn = LinearRegression().fit(X, y)
y_pred = model2_sklearn.predict(X)

# Calculate MSE
mse = mean_squared_error(y, y_pred)

# statsmodels fit, used for adjusted R-squared; `model2` ends up bound to this
# OLS result. The intercept-augmented matrix gets its own name so `X` remains
# the plain feature matrix.
X_const = sm.add_constant(X)
model2 = sm.OLS(y, X_const).fit()
adj_r2 = model2.rsquared_adj

print("Adjusted R2: ", adj_r2)
print("\nMean Squared Error: ", mse)

# Question 4

In [None]:
# Read the Titanic passenger table; the path is relative to the working directory.
titanic_data = pd.read_csv("titanic3.csv")

# The mean of the `survived` column is used as the overall survival probability.
survived_column = titanic_data['survived']
survival_probability = survived_column.mean()
print("Probability of Survival:", survival_probability)

In [None]:
# Bin edges and labels for the two age groups. pd.cut uses right-closed
# intervals by default, so the bins are (0, 18] and (18, 80].
age_interval = [0, 18, 80]
label = ['0-18', '19-80']
titanic_data['Age_Group'] = pd.cut(
    x=titanic_data['age'],
    bins=age_interval,
    labels=label,
)

# Mean of `survived` within every passenger class / sex / age group combination
grouping_keys = ["pclass", "sex", "Age_Group"]
survival_by_group = titanic_data.groupby(grouping_keys)
survival_table = survival_by_group["survived"].mean()

# Show the resulting table
print(survival_table)

In [None]:
# Prepare the design matrix (passenger class, sex, age) and the binary target.
X = titanic_data[["pclass", "sex", "age"]].copy()
y = titanic_data["survived"]

# Encode the categorical `sex` column as binary (female -> 0, male -> 1)
X["sex"] = X["sex"].map({"female": 0, "male": 1})

# Fill missing ages with the mean age. The column is reassigned instead of using
# `X["age"].fillna(..., inplace=True)`: that form is chained assignment through
# an inplace method, which warns in recent pandas and leaves `X` unchanged under
# Copy-on-Write, so the NaNs would reach the model.
X["age"] = X["age"].fillna(X["age"].mean())

# Split the data into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (zero mean, unit variance). The scaler is fitted on
# the training split only and then applied to the test split. This does not
# remove outliers; it only rescales the columns.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build and fit the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Print the parameter estimates. Because the model was fitted on standardized
# inputs, each coefficient is expressed per one standard deviation of its feature.
coefficients = model.coef_
intercept = model.intercept_
print("Model Coefficients (Parameters):")
print("Intercept (Bias):", intercept)
print("Coefficient for pclass:", coefficients[0, 0])
print("Coefficient for sex:", coefficients[0, 1])
print("Coefficient for age:", coefficients[0, 2])

In [None]:
# Wald test for each slope of the fitted logistic regression.
#
# The asymptotic covariance of the logistic-regression estimates is the inverse
# of the Fisher information X' W X, where X is the design matrix *including* the
# intercept column and W = diag(p_i * (1 - p_i)) holds the fitted Bernoulli
# variances. Using inv(X' X) without the weights and without the intercept (the
# ordinary-least-squares shape) gives wrong standard errors for this model.
# NOTE(review): scikit-learn's LogisticRegression is L2-regularized by default,
# so these are approximations to the unpenalized maximum-likelihood errors.
design_matrix = np.column_stack([np.ones(X_train.shape[0]), X_train])
fitted_probabilities = model.predict_proba(X_train)[:, 1]
bernoulli_variances = fitted_probabilities * (1.0 - fitted_probabilities)
fisher_information = design_matrix.T @ (design_matrix * bernoulli_variances[:, np.newaxis])
coefficient_covariance = np.linalg.inv(fisher_information)

# Standard errors of the slopes; entry 0 belongs to the intercept and is dropped
# so the array lines up with the columns of `model.coef_`.
coef_standard_errors = np.sqrt(np.diag(coefficient_covariance))[1:]

# Wald statistic for each coefficient: estimate divided by its standard error
wald_stats = model.coef_ / coef_standard_errors

# Two-tailed p-values. The survival function keeps precision for large
# statistics, where 1 - cdf would round to exactly 0.
p_values = 2 * stats.norm.sf(np.abs(wald_stats))

# Significance level
alpha = 0.05

# Number of tested coefficients (one per feature column)
num_variables = p_values.shape[1]

# Flag which coefficients are statistically significant at level alpha
significant_variables = ["Yes" if p_value < alpha else "No" for p_value in p_values[0]]

# Print the results, labelled with the feature names of the design matrix
feature_names = [X.columns[i] for i in range(num_variables)]

print("Wald Statistics:")
for name, statistic in zip(feature_names, wald_stats[0]):
    print(f"Coefficient for {name}:", statistic)

print("\nP-values:")
for name, p_value in zip(feature_names, p_values[0]):
    print(f"P-value for {name}:", p_value)

print("\nStatistically Significant Variables:")
for name, verdict in zip(feature_names, significant_variables):
    print(f"Is {name} significant?", verdict)

# Performance of the model

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Class predictions for the held-out test split
y_pred = model.predict(X_test)

# Confusion matrix of true vs. predicted labels
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Accuracy = correctly classified samples (the two diagonal cells) / all samples
correct_predictions = conf_matrix[0, 0] + conf_matrix[1, 1]
total_predictions = conf_matrix.sum()
accuracy = correct_predictions / total_predictions
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1 summary
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)