<a href="https://colab.research.google.com/github/Panchagiri-vidyasri/data-analysis-using-python_2116/blob/main/2203a52116_daup_lab12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import PCA
from scipy.stats import ttest_1samp, chi2_contingency
from statsmodels.stats.weightstats import ztest

# Load dataset
df = pd.read_csv("/content/Student_Mental_Stress_and_Coping_Mechanisms.csv")

df.dropna(inplace=True)

# Encode categorical variables
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
target_col = None
for col in df.columns:
    if "stress" in col.lower():
        target_col = col
        break

if target_col is None:
    raise KeyError("Mental stress level column not found in dataset. Check dataset structure.")

X = df.drop(columns=[target_col])
y = df[target_col]

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC()
}

# Evaluate models
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average='weighted'),
        "Recall": recall_score(y_test, y_pred, average='weighted'),
        "F1-score": f1_score(y_test, y_pred, average='weighted')
    }

# Print results
results_df = pd.DataFrame(results).T
print(results_df)

# Feature importance from Random Forest
rf = models["Random Forest"]
importances = rf.feature_importances_
important_features = pd.Series(importances, index=X.columns).sort_values(ascending=False)
print("Top 3 most important features:")
print(important_features.head(3))

# PCA for dimensionality reduction
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
print("Explained variance ratio:", pca.explained_variance_ratio_)





                     Accuracy  Precision    Recall  F1-score
Logistic Regression  0.210526   0.196597  0.210526  0.198562
Random Forest        0.157895   0.168216  0.157895  0.157918
SVM                  0.164474   0.169806  0.164474  0.160504
Top 3 most important features:
Student ID              0.097560
Study Hours Per Week    0.089673
Age                     0.068711
dtype: float64
Explained variance ratio: [0.99183725 0.0066955 ]


In [18]:
# One-sample T-test
t_stat, p_value = ttest_1samp(y, 5)
print("T-test p-value:", p_value)

sample1 = np.random.normal(50, 10, 100)
sample2 = np.random.normal(52, 10, 100)
z_stat, p_value = ztest(sample1, sample2)
print(f"Z-Test: Z-stat = {z_stat:.2f}, p-value = {p_value:.4f}")

T-test p-value: 2.905158136943706e-175
Z-Test: Z-stat = -1.05, p-value = 0.2934


In [20]:
import scipy.stats as stats
import numpy as np

def chi_square_test(observed):

    observed = np.array(observed)
    chi2_stat, p_value, dof, expected = stats.chi2_contingency(observed)
    return chi2_stat, p_value, dof, expected

# Example usage:
observed_data = [[50, 30], [20, 40]]  # Example contingency table
chi2_stat, p_value, dof, expected = chi_square_test(observed_data)
print(f"Chi-Square Statistic: {chi2_stat}")
print(f"P-Value: {p_value}")
print(f"Degrees of Freedom: {dof}")
print(f"Expected Frequencies: \n{expected}")


Chi-Square Statistic: 10.529166666666667
P-Value: 0.0011750518530845063
Degrees of Freedom: 1
Expected Frequencies: 
[[40. 40.]
 [30. 30.]]
