In [None]:
import pandas as pd #data preprocessing
import numpy as np #linear algebra 

In [None]:
import warnings

# Silence every warning for the rest of the notebook.
# A preceding filterwarnings('always') call was removed: the 'ignore' filter
# is inserted in front of it and therefore always won, so it had no effect.
warnings.filterwarnings('ignore')

# Load the Data

In [None]:
# Read the case-surveillance CSV into the main working frame.
covid = pd.read_csv(
    "../input/covid19-case-surveillance-public-use-dataset/COVID-19_Case_Surveillance_Public_Use_Data.csv"
)

# Check for null values

In [None]:
# Number of missing values in each column of the raw frame.
covid.isna().sum()

In [None]:
import seaborn as sns 
import matplotlib.pyplot as plt

In [None]:
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
%%time
covid.head()

## **Shape of the dataset**

In [None]:
# (rows, columns) of the frame as loaded, before any cleaning.
covid.shape

# Check the number of unique values in each column of the dataset

In [None]:
# Distinct non-null values per column.
covid.nunique(axis=0)

# Drop columns that are not significant

In [None]:
# Remove the two date columns that are not used in this analysis.
# Note: re-running this cell raises KeyError because the columns are already gone.
covid = covid.drop(columns=['pos_spec_dt', 'onset_dt'])

# Drop rows with missing values

In [None]:
# Keep only the rows that have no missing value in any remaining column.
covid = covid.dropna(axis=0, how='any')

In [None]:
# Re-check the per-column missing-value counts after dropping incomplete rows.
covid.isna().sum()

In [None]:
# (rows, columns) remaining after the column drop and dropna above.
covid.shape

In [None]:
# Summary statistics of the cleaned frame (pandas default column selection).
covid.describe()

In [None]:
# Categorical columns whose value distribution is printed below.
colums = ['current_status', 'sex', 'age_group', 'Race and ethnicity (combined)', 'hosp_yn','icu_yn', 'death_yn', 'medcond_yn']
for col in colums:
    print(col)
    # Count the values of this single column. Indexing with the whole
    # `colums` list printed the same joint multi-column count every iteration.
    print(covid[col].value_counts())
    print("______________________")

In [None]:
# Same summary statistics, with one row per column for easier reading.
covid.describe().transpose()

In [None]:
# Distinct non-null values per column after cleaning.
covid.nunique(axis='index', dropna=True)

# **Some Interesting Insights from Visualization**

In [None]:
# Pie chart of the share of each `medcond_yn` value.
fig = plt.figure(figsize=(30, 10))
fig.subplots_adjust(left=0.125, bottom=0.1, right=0.9, top=0.9,
                    wspace=0.5, hspace=0.2)
# First slot of a 1x4 grid, matching the other pie-chart cells.
ax = fig.add_subplot(1, 4, 1)
ax.set_title('medcond_yn ', fontsize=20)
covid['medcond_yn'].value_counts().plot.pie(autopct="%1.1f%%", ax=ax)

In [None]:
# Pie chart of the share of each `death_yn` value.
fig = plt.figure(figsize=(30, 10))
fig.subplots_adjust(left=0.125, bottom=0.1, right=0.9, top=0.9,
                    wspace=0.5, hspace=0.2)
# First slot of a 1x4 grid, matching the other pie-chart cells.
ax = fig.add_subplot(1, 4, 1)
ax.set_title('death_yn', fontsize=20)
covid['death_yn'].value_counts().plot.pie(autopct="%1.1f%%", ax=ax)

In [None]:
# Pie chart of the share of each `hosp_yn` value.
fig = plt.figure(figsize=(30, 10))
fig.subplots_adjust(left=0.125, bottom=0.1, right=0.9, top=0.9,
                    wspace=0.5, hspace=0.2)
# First slot of a 1x4 grid, matching the other pie-chart cells.
ax = fig.add_subplot(1, 4, 1)
ax.set_title('hosp_yn', fontsize=20)
covid['hosp_yn'].value_counts().plot.pie(autopct="%1.1f%%", ax=ax)

In [None]:
# Pie chart of the share of each `icu_yn` value.
fig = plt.figure(figsize=(30, 10))
fig.subplots_adjust(left=0.125, bottom=0.1, right=0.9, top=0.9,
                    wspace=0.5, hspace=0.2)
# First slot of a 1x4 grid, matching the other pie-chart cells.
ax = fig.add_subplot(1, 4, 1)
ax.set_title('icu_yn', fontsize=20)
covid['icu_yn'].value_counts().plot.pie(autopct="%1.1f%%", ax=ax)

In [None]:
# Pie chart of the share of each `Race and ethnicity (combined)` value.
fig = plt.figure(figsize=(30, 10))
fig.subplots_adjust(left=0.125, bottom=0.1, right=0.9, top=0.9,
                    wspace=0.5, hspace=0.2)
# First slot of a 1x4 grid, matching the other pie-chart cells.
ax = fig.add_subplot(1, 4, 1)
ax.set_title('Race and ethnicity (combined)', fontsize=20)
covid['Race and ethnicity (combined)'].value_counts().plot.pie(autopct="%1.1f%%", ax=ax)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Heatmap of the missing-value mask, one cell per (row, column).
# NOTE(review): rows with missing values were already dropped earlier in the
# notebook, so this mask is expected to be entirely False at this point.
plt.figure(figsize=(20, 50))
sns.heatmap(data=covid.isna(), cbar=False)

In [None]:
# Earliest and latest CDC report date present in the cleaned data.
# NOTE(review): the CSV is read without date parsing in the visible code, so
# min/max presumably compare strings; this is only chronological if the date
# format sorts lexicographically - confirm.
report_dates = covid['cdc_report_dt']
print("Start Date:", report_dates.min())
print("End Date:", report_dates.max())

In [None]:
# Number of cases in each age bracket.
covid.loc[:, 'age_group'].value_counts()

# Data Preparation For Model

In [None]:
# Work on an independent copy so the encoding steps below leave `covid` untouched.
data = covid.copy(deep=True)

In [None]:
# Category frequencies of `sex` in the modelling copy.
data.loc[:, 'sex'].value_counts()

# Converting categorical features to numeric

In [None]:
from sklearn.preprocessing import LabelEncoder

# Single encoder instance; later cells call fit_transform on it once per
# column, so after those calls it only holds the classes of the last column fitted.
lb_make = LabelEncoder()

In [None]:
# Category frequencies of `sex`, shown again before it is encoded.
data['sex'].value_counts(sort=True)

**Encode the `sex` column in place as integers: Female=0, Male=1, Unknown=2, Missing=3, Other=4**

In [None]:
# Integer code for each `sex` category handled by this notebook; any value
# not listed here becomes NaN after the mapping.
sex_codes = {'Female': 0, 'Male': 1, 'Unknown': 2, 'Missing': 3, 'Other': 4}

# Map from the untouched `covid` frame instead of from `data` itself, so the
# cell can be re-run safely: mapping the already-encoded integers a second
# time would turn the whole column into NaN. `data` is a copy of `covid`, so
# the two share the same index and the assignment aligns row by row.
data['sex'] = covid['sex'].map(sex_codes)

# Last expression gives the rich table display instead of plain printed text.
data.head()


In [None]:
# Replace each categorical column with integer labels. The shared encoder is
# refitted per column, in this order, so it ends up fitted on "medcond_yn".
for column in ["current_status", "hosp_yn", "icu_yn", "death_yn", "medcond_yn"]:
    data[column] = lb_make.fit_transform(data[column])

In [None]:
# First five rows of the encoded frame.
data.head(n=5)

In [None]:
# Shape of the encoded modelling frame. This section prepares `data`, so it
# is the frame inspected here rather than the pre-encoding `covid`.
data.shape

**Define X and y**

In [None]:
# Encoded predictor columns and the encoded death outcome as the target.
feature_columns = ['current_status', 'hosp_yn', 'icu_yn', 'medcond_yn', 'sex']
X = data[feature_columns]
y = data['death_yn']

**Train test split**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Logistic regression

In [None]:
# Logistic Regression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
Y_pred = logreg.predict(X_test)
# Accuracy (in percent) on the held-out split. Scoring on the training rows,
# as before, measures fit to data the model has already seen and does not
# say how it performs on unseen cases.
acc_log = round(logreg.score(X_test, y_test) * 100, 2)
acc_log

# XGBoost

In [None]:
##XGBoost
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Gradient-boosted trees with library-default hyperparameters, fitted on the
# training split only.
model = XGBClassifier()
model.fit(X=X_train, y=y_train)


In [None]:
Y_pred = model.predict(X_test)
# Accuracy (in percent) on the held-out split, stored under its own name.
# This cell previously scored the training rows and overwrote `acc_log`,
# which holds the logistic regression result.
acc_xgb = round(model.score(X_test, y_test) * 100, 2)
acc_xgb

# Decision tree

In [None]:
# Decision Tree

# Fixed seed (same value as the train/test split) so that tie-breaking
# between equally good splits is reproducible across runs.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)
Y_pred = decision_tree.predict(X_test)
# Accuracy (in percent) on the held-out split. An unconstrained tree can fit
# its own training rows very closely, so training accuracy is not a useful
# measure of performance on unseen cases.
acc_decision_tree = round(decision_tree.score(X_test, y_test) * 100, 2)
acc_decision_tree