---
### - Downloading Needed Libraries : 
---

In [11]:
! pip install statsmodels
! pip install pandas
! pip install numpy
! pip install statsmodels pandas numpy

  pid, fd = os.forkpty()




---
### - Needed libraries : 
---

In [12]:
import pandas as pd
import statsmodels.api as sm
from scipy import stats
import seaborn as sns
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

---
### - Reading the Datasets from its Path : 
---

In [13]:
# Load the datasets 
data = pd.read_csv('/kaggle/input/covid-19-case-surveillance/data.csv')
data_1= pd.read_csv('/kaggle/input/household-pulse-survey/pulse_puf_most_recent.csv')

  data = pd.read_csv('/kaggle/input/covid-19-case-surveillance/data.csv')


---
### - Testing The Datasets : 
---

In [14]:
 print(data.head(1))
 # print(data_1.head(1))

  case_month res_state  state_fips_code res_county  county_fips_code  \
0    2020-12        MN             27.0   HENNEPIN           27053.0   

        age_group     sex     race        ethnicity  \
0  18 to 49 years  Female  Unknown  Hispanic/Latino   

   case_positive_specimen_interval  case_onset_interval  process exposure_yn  \
0                              0.0                  NaN  Missing     Missing   

              current_status symptom_status  hosp_yn   icu_yn death_yn  \
0  Laboratory-confirmed case        Missing  Missing  Missing       No   

  underlying_conditions_yn  
0                      NaN  


---
### -Part 4 || Regression Analysis : 
---

In [15]:
# Filter out necessary columns
columns_needed = [ 'case_month', 'sex', 'age_group', 'icu_yn', 'hosp_yn', 'death_yn']
data = data[columns_needed]

# Define age groups for consistency
age_groups = ['0 - 17 years', '18 to 49 years', '50 to 64 years', '65+ years']

# Clean and transform the data
data['death_yn'] = data['death_yn'].map({'Yes': 1, 'No': 0})
data['hosp_yn'] = data['hosp_yn'].map({'Yes': 1, 'No': 0})
data['icu_yn'] = data['icu_yn'].map({'Yes': 1, 'No': 0})
data['sex'] = data['sex'].fillna('Unknown')
data['age_group'] = data['age_group'].fillna('Unknown')

# Group by case_month and calculate proportions
monthly_data = data.groupby('case_month').agg(percent_deaths=('death_yn', 'mean'),percent_icu=('icu_yn', 'mean'),percent_hospitalized=('hosp_yn', 'mean'),  percent_female=('sex', lambda x: (x == 'Female').mean()),percent_male=('sex', lambda x: (x == 'Male').mean()),percent_age_group_0_17=('age_group', lambda x: (x == '0 - 17 years').mean()), percent_age_group_18_49=('age_group', lambda x: (x == '18 to 49 years').mean()), percent_age_group_50_64=('age_group', lambda x: (x == '50 to 64 years').mean()),percent_age_group_65_plus=('age_group', lambda x: (x == '65+ years').mean())).reset_index()

# Prepare the independent variables
X = monthly_data[['percent_female', 'percent_male', 'percent_age_group_0_17','percent_age_group_18_49', 'percent_age_group_50_64','percent_age_group_65_plus', 'percent_icu', 'percent_hospitalized']]

# Add a constant to the model (intercept)
X = sm.add_constant(X)

# Dependent variable
y = monthly_data['percent_deaths']

# Fit the regression model
model = sm.OLS(y, X).fit()

# Print the summary of the regression
print('The Summary fo Regression Model :')
print(model.summary())
print('#########################################################################')

# Part1: Extract coefficients and p-values
coefficients = model.params
p_values = model.pvalues
results = pd.DataFrame({'Coefficient': coefficients,'P-value': p_values})
print('Part [1] : Coefficients && P-value')
print(results)
print('#########################################################################')


# Part 2: Determing whether the variables are good predictors or not 
significance_level = 0.05
good_predictors = results[results['P-value'] < significance_level]
bad_predictors = results[results['P-value'] >= significance_level]
# Display the results
print("Part [2] :  Good Predictors (P-value < 0.05):")
print(good_predictors)
print('#########################################################################')
print("\nPart [2] : Bad Predictors (P-value >= 0.05):")
print(bad_predictors)
print('#########################################################################')


# part 3: Check for multicollinearity
correlation_matrix = X.corr()
print('Part [3] : Correlation Matrix : ')
print(correlation_matrix)
print('#########################################################################')


# part 4: Experiment with different ways to improve the fit and interpretability : 
# techniques : 
#technique (1) Add or Remove Intercept 
print('Part [4] : Experiment with different ways to improve the fit and interpretability techniques :  ')
# Prepare the 4independent variables without adding a constant
X_no_intercept = monthly_data[['percent_female', 'percent_male', 'percent_age_group_0_17','percent_age_group_18_49', 'percent_age_group_50_64','percent_age_group_65_plus', 'percent_icu', 'percent_hospitalized']]
# Fit the regression model without an intercept
model_no_intercept = sm.OLS(y, X_no_intercept).fit()
# Print the summary of the regression without intercept
print('summary of the regression without intercept : ')
print(model_no_intercept.summary())
print('#########################################################################')


#technique (2) Add squared terms to the independent variables
X_with_higher_order = X.copy()
X_with_higher_order['percent_icu_squared'] = X['percent_icu'] ** 2
X_with_higher_order['percent_hospitalized_squared'] = X['percent_hospitalized'] ** 2
# Fit the regression model with higher-order terms
model_higher_order = sm.OLS(y, X_with_higher_order).fit()
# Print the summary of the regression with higher-order terms
print('summary of the regression with higher-order terms :')
print(model_higher_order.summary())
print('#########################################################################')



#technique (3) Fit the initial regression model to identify outliers
initial_model = sm.OLS(y, X).fit()
# Calculate residuals
residuals = initial_model.resid
# Define a threshold for outliers, for example, 3 standard deviations from the mean
threshold = 3 * residuals.std()
# Filter out outliers
df_no_outliers = monthly_data[abs(residuals) < threshold]
# Prepare the independent variables without outliers
X_no_outliers = sm.add_constant(df_no_outliers[['percent_female', 'percent_male',  'percent_age_group_0_17', 'percent_age_group_18_49','percent_age_group_50_64', 'percent_age_group_65_plus', 'percent_icu', 'percent_hospitalized']])
# Dependent variable without outliers
y_no_outliers = df_no_outliers['percent_deaths']
# Fit the regression model without outliers
model_no_outliers = sm.OLS(y_no_outliers, X_no_outliers).fit()
# Print the summary of the regression without outliers
print('summary of the regression without outliers :')
print(model_no_outliers.summary())
print('#########################################################################')


The Summary fo Regression Model :
                            OLS Regression Results                            
Dep. Variable:         percent_deaths   R-squared:                       0.869
Model:                            OLS   Adj. R-squared:                  0.844
Method:                 Least Squares   F-statistic:                     34.91
Date:                Thu, 23 May 2024   Prob (F-statistic):           3.78e-16
Time:                        17:11:30   Log-Likelihood:                 149.38
No. Observations:                  51   AIC:                            -280.8
Df Residuals:                      42   BIC:                            -263.4
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------

---
---
---

### - Bonus Part : 
---

---
#### - Logistic Regression : 
---

In [16]:
# Extract features and target
X_ml = monthly_data[['percent_female', 'percent_male', 'percent_age_group_0_17',
                     'percent_age_group_18_49', 'percent_age_group_50_64',
                     'percent_age_group_65_plus', 'percent_icu', 'percent_hospitalized']]
y_ml = (monthly_data['percent_deaths'] > 0).astype(int)

# Split data into training and testing sets
X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(X_ml, y_ml, test_size=0.2, random_state=42)

# Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_ml, y_train_ml)

# Predictions
y_pred_log_reg = log_reg.predict(X_test_ml)

# Evaluation
print("Logistic Regression Classification Report")
print(classification_report(y_test_ml, y_pred_log_reg))
print(f"Accuracy: {accuracy_score(y_test_ml, y_pred_log_reg)}")
print('#########################################################################')


Logistic Regression Classification Report
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        11

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11

Accuracy: 1.0
#########################################################################


#### - Random Forest Classifier : 

In [17]:
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train_ml, y_train_ml)
# Predictions
y_pred_rf = rf_clf.predict(X_test_ml)
# Evaluation
print("Random Forest Classification Report")
print(classification_report(y_test_ml, y_pred_rf))
print(f"Accuracy: {accuracy_score(y_test_ml, y_pred_rf)}")
print('#########################################################################')


Random Forest Classification Report
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        11

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11

Accuracy: 1.0
#########################################################################


#### -  Neural Network :

In [18]:
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=X_train_ml.shape[1]))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_ml, y_train_ml, epochs=20, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_ml, y_test_ml)
print(f"Neural Network Accuracy: {accuracy}")
print('#########################################################################')


Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.0000e+00 - loss: 0.7351 - val_accuracy: 0.1250 - val_loss: 0.7019
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.0312 - loss: 0.7070 - val_accuracy: 0.8750 - val_loss: 0.6802
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 1.0000 - loss: 0.6804 - val_accuracy: 0.8750 - val_loss: 0.6598
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 1.0000 - loss: 0.6549 - val_accuracy: 0.8750 - val_loss: 0.6404
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 1.0000 - loss: 0.6306 - val_accuracy: 0.8750 - val_loss: 0.6230
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 1.0000 - loss: 0.6082 - val_accuracy: 0.8750 - val_loss: 0.6075
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3