# All scripts by Ryan Holbrook, Data Scientist (Kaggle Deep Learning Course - Exercise: Stochastic Gradient Descent)

![](https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcScFPlPBq5g8eSPa07gsjbAA1xLTUjdUh-56A&usqp=CAU)slideshare.net

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as py
import plotly.express as px

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively walk the Kaggle input directory and print the full path of
# every file found beneath it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from colorama import Fore, Style

# Row cap for a quick preview; set to None to read the whole file.
nRowsRead = 1000
# Only the first nRowsRead rows of the COVID-19 death-cases CSV are loaded.
df = pd.read_csv(
    '../input/hackathon/task_2-COVID-19-death_cases_per_country_after_first_death-till_22_September_2020.csv',
    delimiter=',',
    nrows=nRowsRead,
)
# Record the source file name on the frame as a plain attribute.
df.dataframeName = 'task_2-COVID-19-death_cases_per_country_after_first_death-till_22_September_2020.csv'
# Report the loaded dimensions twice: plain text, then with a coloured label.
nRow, nCol = df.shape[0], df.shape[1]
print(f'There are {nRow} rows and {nCol} columns')
print(Fore.YELLOW + 'Data shape: ',Style.RESET_ALL,df.shape)
df.head()

In [None]:
# Let's find the null values in data

# Build the null mask once, then derive per-column counts and fractions of
# missing values, each sorted from most to least missing.
null_mask = df.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
# Side-by-side table of absolute and relative missingness; show the top 10.
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(10)

#Handling Missing Values

In [None]:
# categorical features with missing values
# Names of object-dtype (categorical) columns holding at least one missing value.
categorical_nan = [
    column
    for column in df.columns
    if df[column].isna().any() and df[column].dtypes == 'O'
]
print(categorical_nan)

In [None]:
# replacing missing values in categorical features
# Substitute the literal string 'None' for every missing entry in each
# categorical column collected in `categorical_nan`.
for feature in categorical_nan:
    filled_column = df[feature].fillna('None')
    df[feature] = filled_column

In [None]:
# Sanity check: remaining missing-value count per categorical column.
df[categorical_nan].isnull().sum()

In [None]:
# Lets handle numerical features with nan value
# Names of non-object (numerical) columns that contain at least one missing
# value. The threshold is `> 0` (the same as the categorical check) so that a
# column with exactly one NaN is not skipped and left unfilled.
numerical_nan = [
    feature
    for feature in df.columns
    if df[feature].dtypes != 'O' and df[feature].isna().sum() > 0
]
numerical_nan

In [None]:
## Replacing the numerical Missing Values

for feature in numerical_nan:
    ## We will replace by using median since there are outliers
    median_value = df[feature].median()

    # Assign the filled column back instead of calling
    # `df[feature].fillna(..., inplace=True)`: the in-place call acts on a
    # temporary selection and may not update `df` (pandas warns about this
    # chained pattern and it has no effect under Copy-on-Write).
    df[feature] = df[feature].fillna(median_value)

# Sanity check: remaining missing-value count per numerical column.
df[numerical_nan].isnull().sum()

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.model_selection import train_test_split

#fuel = pd.read_csv('../input/dl-course-data/fuel.csv')

# Work on a copy so the original frame stays intact.
X = df.copy()
# Remove target
y = X.pop('stringency_index_10_days_after_first_death')

# The dense-output flag of OneHotEncoder was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was removed in 1.4.
# Try the current name first and fall back for older installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

# Standardize numeric columns and one-hot encode object columns.
preprocessor = make_column_transformer(
    (StandardScaler(),
     make_column_selector(dtype_include=np.number)),
    (encoder,
     make_column_selector(dtype_include=object)),
)

X = preprocessor.fit_transform(X)
# NOTE(review): np.log maps a target value of 0 to -inf; confirm the target
# is strictly positive, or consider np.log1p.
y = np.log(y) # log transform target instead of standardizing

# Number of feature columns after preprocessing, in the list form the first
# Keras layer receives as `input_shape`.
input_shape = [X.shape[1]]
print("Input shape: {}".format(input_shape))

In [None]:
# Peek at the raw frame, then at the first rows of the processed feature matrix.
df.head()
pd.DataFrame(X[:10]).head()

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

# Fully connected regression network: three ReLU hidden layers
# (128, 128 and 64 units) followed by a single linear output unit.
model = keras.Sequential()
model.add(layers.Dense(128, activation='relu', input_shape=input_shape))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1))

In [None]:
# Train with the Adam optimizer against mean absolute error.
model.compile(loss='mae', optimizer='adam')


In [None]:
# Fit on the full preprocessed data set: 200 epochs, mini-batches of 128 rows.
# The returned object keeps the per-epoch training history.
history = model.fit(X, y, epochs=200, batch_size=128)

In [None]:
import pandas as pd

# Turn the recorded training history into a frame with one row per epoch.
history_df = pd.DataFrame(history.history)
# Start the plot at epoch 5. You can change this to get a different view.
loss_from_epoch_5 = history_df.loc[5:, ['loss']]
loss_from_epoch_5.plot();

In [None]:
# Setup plotting
import matplotlib.pyplot as plt
from learntools.deep_learning_intro.dltools import animate_sgd

# The bundled seaborn styles were renamed with a 'seaborn-v0_8-' prefix in
# Matplotlib 3.6 and the old names were later removed. Use the new name when
# this installation provides it, otherwise fall back to the legacy name.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)
# Set Matplotlib defaults
plt.rc('figure', autolayout=True)
plt.rc('axes', labelweight='bold', labelsize='large',
       titleweight='bold', titlesize=18, titlepad=10)
plt.rc('animation', html='html5')

# Setup feedback system
from learntools.core import binder
binder.bind(globals())
from learntools.deep_learning_intro.ex3 import *

In [None]:
import matplotlib.pyplot as plt
from learntools.deep_learning_intro.dltools import animate_sgd

# YOUR CODE HERE: Experiment with different values for the learning rate, batch size, and number of examples
learning_rate, batch_size, num_examples = 0.05, 32, 256

# You can also change these, if you like
fixed_settings = dict(
    steps=50, # total training steps (batches seen)
    true_w=3.0, # the slope of the data
    true_b=2.0, # the bias of the data
)

# Run the SGD animation with the experiment settings chosen above.
animate_sgd(
    learning_rate=learning_rate,
    batch_size=batch_size,
    num_examples=num_examples,
    **fixed_settings,
)

In [None]:
# List the contents of the UZ folder of the task-1 Google search text files.
# This bare `ls` relies on the notebook's shell-command shortcut; it is not plain Python.
ls ../input/hackathon/task_1-google_search_txt_files_v2/UZ/

In [None]:
# Path to one text file in the UZ folder of the task-1 Google search results.
Uzbekistan = '../input/hackathon/task_1-google_search_txt_files_v2/UZ/Uzbekistan-en-result-113-original.txt'

In [None]:
# Read the whole file into memory as UTF-8, silently dropping undecodable
# bytes. The context manager closes the file handle as soon as the read
# finishes (the bare open(...).read() left it open).
with open(Uzbekistan, 'r', encoding='utf-8', errors='ignore') as text_file:
    text = text_file.read()

In [None]:
# Show only the first 2500 characters of the loaded text.
preview = text[:2500]
print(preview)

#Tuberculosis: A Global Threat

"Despite nearly a century of use, the Bacille Calmette-Guérin (BCG) vaccine continues to be controversial, with known variations in BCG substrains and vaccine efficacy.

Because vaccination policies and practices vary across time and countries, they created the first searchable, online, open access database of global BCG vaccination policy and practices, the BCG World Atlas (http://www.bcgatlas.org/), which contains detailed information on current and past BCG policies and practices for over 180 countries.

The Atlas is for clinicians, policymakers, and researchers and provides information that may be helpful for better interpretation of tuberculosis (TB) diagnostics as well as design of new TB vaccines." 

Those words appear in the text (.txt) file shown above.

In [None]:
#Code by Olga Belitskaya https://www.kaggle.com/olgabelitskaya/sequential-data/comments
from IPython.display import display,HTML
# Default colours, font family names and pixel sizes for the HTML heading helper.
c1, c2 = '#eb3434', '#eb3446'
f1, f2 = 'Akronim', 'Smokum'
fs1, fs2 = 30, 15
def dhtml(string,fontcolor=c1,font=f1,fontsize=fs1):
    """Display ``string`` as a styled ``<h1>`` heading in the notebook output.

    The generated markup imports the requested Google Fonts family with the
    ``3d-float`` effect and applies the given colour and pixel size.

    Args:
        string: Heading text; inserted with ``%s`` formatting, not HTML-escaped.
        fontcolor: CSS colour of the heading (defaults to ``c1``).
        font: Google Fonts family name (defaults to ``f1``).
        fontsize: Font size in pixels (defaults to ``fs1``).
    """
    markup = "".join([
        "<style>\n    @import 'https://fonts.googleapis.com/css?family=",
        font,
        "&effect=3d-float';</style>\n    <h1 class='font-effect-3d-float' style='font-family:",
        font,
        "; color:",
        fontcolor,
        "; font-size:",
        str(fontsize),
        # Only this final fragment carries the %s placeholder.
        "px;'>%s</h1>" % string,
    ])
    display(HTML(markup))
    
    
# Closing credit rendered with the helper's default colour, font and size.
dhtml(string='Thanks Ryan Holbrook, @mpwolke Was Here.')