### Statistics Question 2

#### Consider “heart.xls” dataset

In [21]:
import pandas as pd
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Read the heart-disease workbook into a DataFrame and display it.
# NOTE(review): the location below is an absolute, machine-specific path.
heart_workbook = "C:\\Users\\warshape\\Desktop\\heart.xlsx"
df = pd.read_excel(heart_workbook)
df

#### a) Consider the entire dataset. Is there any difference in the presence of heart disease (no/less chance vs. more chance) between female and male patients? (Use a statistical test.)

In [22]:
# Cross-tabulate sex (rows) against target (columns)
contingency_table = pd.crosstab(index=df['sex'], columns=df['target'])

# Chi-square test on the table; `expected` holds the expected frequencies
# and is reused by the residuals cell
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

print(f"Chi-Square Statistic: {chi2}")
print(f"P-value: {p}")

# Decide at the 5% significance level
if p < 0.05:
    print("Significant relationship between gender and heart disease.")
else:
    print("No significant relationship.")

In [23]:
# Show the observed counts, each row as a percentage of its row total, and
# the residuals computed as (observed - expected) / sqrt(expected)
row_totals = contingency_table.sum(axis=1)
row_percentages = (contingency_table.div(row_totals, axis=0) * 100).round(2)
residuals = ((contingency_table - expected) / (expected**0.5)).round(2)

print(f"🔶 Contingency Table \n{contingency_table}\n")
print(f"🔶 Row Percentages (%) \n{row_percentages}\n")
print(f"🔶 Standardized Residuals \n{residuals}\n")

#### b) Do you think normalization or standardization techniques would help to draw meaningful insights from the dataset? If so, what are those?

In [24]:
from scipy.stats import shapiro

# Shapiro-Wilk is a univariate test, so run it once per continuous feature
# instead of on the whole DataFrame: passing `df` would pool binary and
# categorical codes together with differently scaled measurements into a
# single sample, and the resulting W / p-value would describe no real variable.
shapiro_columns = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Per-column results, kept so they can be inspected after the loop
shapiro_results = {}
for col in shapiro_columns:
    # Drop missing values so a single NaN cannot turn the statistic into NaN
    stat, p_value = shapiro(df[col].dropna())
    shapiro_results[col] = (stat, p_value)
    print(f"Shapiro-Wilk Test [{col}]: W={stat}, p-value={p_value}")

In [25]:
# Columns with a numeric dtype
numerical_columns = df.select_dtypes(include=['number']).columns

# Treat a numeric column as continuous when it has more than 20 distinct
# values (heuristic threshold; adjust as needed)
continuous_columns = [
    name for name in numerical_columns if df[name].nunique() > 20
]

# Display the continuous columns
print("Continuous Numerical Columns:")
print(continuous_columns)

In [26]:
# Columns to put on a common scale
cols_to_standardize = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Z-score standardization: fit the scaler on the selected columns, then write
# the scaled values into a copy so that `df` itself is left untouched
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[cols_to_standardize])

df_standardized = df.copy()
df_standardized[cols_to_standardize] = scaled_values

In [27]:
df_standardized  # Display the standardized copy of the dataset

In [28]:
# Sanity check: summarise the scaled columns and keep only the mean and std rows
# NOTE(review): means should be ~0 and stds ~1 after z-scoring — confirm
# against the displayed output.
scaled_summary = df_standardized[cols_to_standardize].describe()
scaled_summary.loc[['mean', 'std']]

Insights we can gather by plotting the features after standardization:

In [29]:
import seaborn as sns

# Feature comparison: scatter-plot matrix of the standardized features
pairplot_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
sns.pairplot(df_standardized[pairplot_features])

# y=1.02 places the figure-level title just above the grid of panels
plt.suptitle('Feature Comparison After Standardization', y=1.02)
plt.show()

In [30]:
# Pairwise correlations between the standardized features, drawn as an
# annotated heatmap
heatmap_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
correlation_matrix = df_standardized[heatmap_features].corr()

sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
)
plt.title('Correlation Matrix After Standardization')
plt.show()

In [31]:
# 3. Box plots of the standardized features, one box per column
boxplot_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
sns.boxplot(data=df_standardized[boxplot_features])

plt.title('Boxplot of Features After Standardization')
plt.xticks(rotation=45)
plt.show()

In [32]:
# 4. Clustering: K-Means with three clusters on the standardized features
from sklearn.cluster import KMeans

cluster_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
kmeans = KMeans(n_clusters=3, random_state=42)
cluster_labels = kmeans.fit_predict(df_standardized[cluster_features])

# Store each row's cluster label as a new column on the standardized frame
df_standardized['Cluster'] = cluster_labels


In [33]:
# Visualise the clusters on two of the clustered dimensions (age vs thalach),
# colouring each point by its cluster label
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=df_standardized,
    x='age',
    y='thalach',
    hue='Cluster',
    palette='Set1',
    marker='o',
)
plt.title('K-Means Clustering of Patients (Age vs Thalach)')
plt.show()

#### c) Perform suitable regression analysis and develop a predictive model. Clearly mention the model, feature selection method and variables in the model. Further, interpret the model parameters.

In [36]:
# Define independent variables (features).
# Drop the response itself and, if present, the exploratory 'Cluster' column
# that the K-Means cell adds to df_standardized. That column is derived from
# the other features and is not an original predictor, so it must not enter
# the regression. errors='ignore' keeps this cell working when the clustering
# cell has not been run.
non_feature_columns = ['target', 'Cluster']
X = df_standardized.drop(columns=non_feature_columns, errors='ignore')
print('independent variables:', X.columns.tolist())

# Define dependent variable (target)
y = df_standardized['target']
print('\ndependent variable:', y.name)

In [37]:
# Recursive feature elimination (RFE) driven by a logistic regression
from sklearn.feature_selection import RFE

# Base estimator that RFE uses to rank the features
logreg = LogisticRegression()

# Keep the 5 top-ranked features.
# NOTE(review): the selector is fitted on all rows of X / y, i.e. before the
# train/test split performed in a later cell — confirm this is intended.
rfe = RFE(estimator=logreg, n_features_to_select=5)
rfe.fit(X, y)
X_selected = rfe.transform(X)

# Names of the columns RFE retained
selected_features = X.columns[rfe.support_]
print("Selected Features:", selected_features)


In [38]:
# Report the feature-matrix dimensions on either side of the selection step
for stage, matrix in (("before", X), ("after", X_selected)):
    print(f"Shape {stage} feature selection:", matrix.shape)

In [39]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable
split_parts = train_test_split(
    X_selected,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [40]:
# Fit a logistic regression on the training split (fit() returns the estimator)
model = LogisticRegression().fit(X_train, y_train)

# Predicted class labels for the held-out split
y_pred = model.predict(X_test)

In [35]:
# Coefficients of the fitted model (first row of coef_)
coefficients = model.coef_[0]

# Pair each selected feature name with its coefficient
coef_df = pd.DataFrame(
    {
        'Feature': selected_features,
        'Coefficient': coefficients,
    }
)

# Display coefficients
print(coef_df)

#### d) Assess the model's predictive power using appropriate evaluation tools. 

In [34]:
# Evaluate the fitted model on the held-out split.

# 1. Accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# 2. Confusion matrix of true vs predicted labels
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", conf_matrix)

# 3. Precision, recall and F1-score per class
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

# 4. ROC-AUC, computed from the predicted probability in column 1 of
#    predict_proba (the second class)
positive_proba = model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, positive_proba)
print(f"AUC Score: {auc_score:.2f}")

### Statistics Question 3

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import shapiro, f_oneway

# Location of the sales workbook; update this with your actual file path
file_path = "C:\\Users\\warshape\\Desktop\\techgear_sales.xlsx"

# Read the workbook and display it (this rebinds `df`, which held the heart
# dataset in the previous question)
df = pd.read_excel(io=file_path)
df

#### a) Create a histogram of purchase amounts and comment on the shape of the distribution

In [3]:
# Histogram of purchase amounts (30 bins) with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['purchase_amount'], bins=30, kde=True, color='skyblue', ax=ax)

ax.set_xlabel("Purchase Amount (USD)")
ax.set_ylabel("Frequency")
ax.set_title("Histogram of Purchase Amounts")
plt.show()

#### b) What is the probability that a randomly selected purchase was made during a promotional period?

In [4]:
# P(promo): taken as the mean of the promo_period column
# NOTE(review): this is a probability only if promo_period is coded 0/1 — confirm
promo_flags = df['promo_period']
promo_probability = promo_flags.mean()
print(f"Probability of purchase during a promotional period: {promo_probability:.2f}")

#### c) Calculate the conditional probability that a purchase amount exceeds 1000 dollars given that it was made during a promotional period.

In [5]:
# Conditional probability P(purchase_amount > 1000 | promo_period = 1):
# restrict to promotional purchases, then take the share whose amount
# exceeds 1000
in_promo = df['promo_period'] == 1
promo_purchases = df[in_promo]
exceeds_1000 = promo_purchases['purchase_amount'] > 1000
prob_exceeds_1000_given_promo = exceeds_1000.mean()
print(f"Conditional Probability P(Purchase > 1000 | Promo): {prob_exceeds_1000_given_promo:.2f}")

#### d) Test whether purchase amounts follow a normal distribution using an appropriate statistical test. State your null and alternative hypotheses and interpret the results at a 5% significance level.

In [6]:
# Shapiro-Wilk normality test on purchase amounts.
# H0: purchase amounts are normally distributed; H1: they are not.
alpha = 0.05  # 5% significance level
stat, p_value = shapiro(df['purchase_amount'])

print("Normality Test Results:")
print(f"Test Statistic: {stat:.4f}, P-Value: {p_value:.4f}")

# H0 is retained only when the p-value is above alpha
verdict = (
    "Fail to reject H0: Data appears to be normally distributed."
    if p_value > alpha
    else "Reject H0: Data does not follow a normal distribution."
)
print(verdict)

#### e) Create age groups for customers (18-25, 26-35, 36-45, 46+) and calculate the mean purchase amount for each group. Conduct a one-way ANOVA test to determine if there are significant differences in purchase amounts across age groups.

In [7]:

### ANOVA Test for Purchase Amounts Across Age Groups ###
# Define age groups.
# pd.cut builds right-closed intervals, so with a lowest edge of 18 the first
# bin would be (18, 25] and 18-year-old customers would get NaN instead of
# "18-25". include_lowest=True closes the first interval on the left as well,
# so the bins match the labels: [18, 25], (25, 35], (35, 45], (45, inf).
bins = [18, 25, 35, 45, float('inf')]
labels = ["18-25", "26-35", "36-45", "46+"]
df['age_group'] = pd.cut(
    df['customer_age'],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)

# Mean purchase amount for each age group (observed=False keeps every label
# in the result and avoids the pandas groupby warning)
mean_purchase_per_group = df.groupby('age_group', observed=False)['purchase_amount'].mean()
print("Mean Purchase Amount per Age Group:")
print(mean_purchase_per_group)

# One-way ANOVA: one sample of purchase amounts per age group. Missing amounts
# are dropped so a single NaN cannot turn the F statistic into NaN.
grouped_data = [
    df.loc[df['age_group'] == age, 'purchase_amount'].dropna() for age in labels
]
anova_stat, anova_p = f_oneway(*grouped_data)

print("\nANOVA Test Results:")
print(f"Test Statistic: {anova_stat:.4f}, P-Value: {anova_p:.4f}")

# `alpha` is the significance level defined in the normality-test cell
if anova_p < alpha:
    print("Reject H0: Significant differences exist in purchase amounts across age groups.")
else:
    print("Fail to reject H0: No significant differences in purchase amounts across age groups.")