In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

### Pandas Profiling is an open-source Python module that lets us perform a quick exploratory data analysis with just a few lines of code. It also generates interactive reports in web (HTML) format that can be presented to anyone, even if they don't know how to program.
### In short, what pandas profiling does is save us all the work of visualizing and understanding the distribution of each variable. It generates a report with all the information easily available.

In [None]:
from pandas_profiling import ProfileReport

## Generates profile reports from a pandas DataFrame. The pandas df.describe() function is great but a little basic for serious exploratory data analysis. pandas_profiling extends the pandas DataFrame with df.profile_report() for quick data analysis.

### For each column the following statistics - if relevant for the column type - are presented in an interactive HTML report:

* Type inference: detect the types of columns in a dataframe.
* Essentials: type, unique values, missing values
* Quantile statistics like minimum value, Q1, median, Q3, maximum, range, interquartile range
* Descriptive statistics like mean, mode, standard deviation, sum, median absolute deviation, coefficient of variation, kurtosis, skewness
* Most frequent values
* Histogram
* Correlations: highlighting of highly correlated variables; Spearman, Pearson and Kendall matrices
* Missing values matrix, count, heatmap and dendrogram of missing values
* Text analysis: learn about categories (Uppercase, Space), scripts (Latin, Cyrillic) and blocks (ASCII) of text data.

In [None]:
# train_profile = ProfileReport(xtrain, title='Pandas Profiling Report', html={'style':{'full_width':True}})
# train_profile
# Load Data
# The three competition files are read from Kaggle's input directory.
xtrain = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-3/train.csv')
xtest = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-3/test.csv')
xsubmission = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-3/submission.csv')

# Shorter column names used throughout the rest of the notebook. `rename` returns
# a new frame, so no frame is mutated in place.
_COLUMN_RENAMES = {'Country_Region': 'Country', 'Province_State': 'State'}
xtrain = xtrain.rename(columns=_COLUMN_RENAMES)
xtest = xtest.rename(columns=_COLUMN_RENAMES)

# Rows without a province get the placeholder 'NA' so that (Country, State) can
# serve as a grouping/merge key (groupby drops NaN keys by default).
xtrain['State'] = xtrain['State'].fillna('NA')
xtest['State'] = xtest['State'].fillna('NA')

# Let pandas infer the date format on its own: the `infer_datetime_format`
# argument is deprecated in pandas 2.x and no longer has any effect there.
xtrain['Date'] = pd.to_datetime(xtrain['Date'])
xtest['Date'] = pd.to_datetime(xtest['Date'])

# Add lagged copies of the two cumulative targets to `xtrain`.
# For each (Country, State) series, `lag_<k>_<target>` holds the value of <target>
# k rows earlier; the first k rows of every series have no history and get 0.
# NOTE(review): "k rows earlier" equals "k days earlier" only if each region has one
# row per day in chronological order — confirm against the input file.
N_LAGS = 14  # must stay in sync with the 14 lags listed in `k_log_col` further down
_LAG_TARGETS = ['ConfirmedCases', 'Fatalities']

# Group once and shift only the two target columns. Shifting the whole frame on
# every iteration would also shift Id/Date and all lag columns created so far.
_targets_by_region = xtrain[['Country', 'State'] + _LAG_TARGETS].groupby(['Country', 'State'])

for j in range(N_LAGS):
    train_lag = _targets_by_region.shift(periods=j + 1)
    for _target in _LAG_TARGETS:
        xtrain['lag_' + str(j + 1) + '_' + _target] = train_lag[_target].fillna(0)



In [None]:
# Display the cumulative confirmed-case column of the training frame.
xtrain['ConfirmedCases']

In [None]:
# The column renames and date parsing are already applied in the data-loading
# cell, so repeating them here would be a no-op and is omitted.

# Print dtype / non-null summaries of both frames.
xtrain.info()
xtest.info()

# Test rows whose (State, Country, Date) also occur in train. The merged frame
# carries the train columns (targets and lag features) next to the test ForecastId.
for_y = xtest.merge(xtrain, on=['State', 'Country', 'Date'], how='inner')

# Targets on the full training frame and on the train/test overlap.
y1_xTrain = xtrain['ConfirmedCases']
y1_xTest = for_y['ConfirmedCases']
y2_xTrain = xtrain['Fatalities']
y2_xTest = for_y['Fatalities']
# Placeholder that marks a row without a province/state.
EMPTY_VAL = "NA"


def fillState(state, country):
    """Return `country` when `state` equals the EMPTY_VAL placeholder, else `state`."""
    return country if state == EMPTY_VAL else state



In [None]:
# Display the fatality values taken from the train/test overlap frame (`for_y`).
y2_xTest

In [None]:
def _with_region_state(frame):
    """Return a copy of `frame` whose State falls back to Country when no state is recorded.

    Missing states (NaN or the EMPTY_VAL placeholder) are replaced by the row's
    Country, so every row ends up with a non-empty State label. Vectorised
    equivalent of applying `fillState` row by row.
    """
    out = frame.copy()
    state = out['State'].fillna(EMPTY_VAL)
    out['State'] = state.mask(state == EMPTY_VAL, out['Country'])
    return out


X_xTrain = _with_region_state(xtrain)

# Keep the real timestamp in Date_ACT and turn Date into the integer MMDD.
# Plain column assignment is used on purpose: `df.loc[:, 'Date'] = <strings>` may
# try to write into the existing datetime column in place on recent pandas
# versions instead of replacing the column.
X_xTrain['Date_ACT'] = X_xTrain['Date']
X_xTrain['Date'] = X_xTrain['Date_ACT'].dt.strftime("%m%d").astype(int)

print(X_xTrain.head())

# Same treatment for the train/test overlap; no Date_ACT column is added here.
X_xTest = _with_region_state(for_y)
X_xTest['Date'] = X_xTest['Date'].dt.strftime("%m%d").astype(int)

print(X_xTest.head())

**Fit Data with Model**

In [None]:
from sklearn import preprocessing

# One encoder per categorical column; both are reused later to decode the labels.
lec = preprocessing.LabelEncoder()
les = preprocessing.LabelEncoder()
_region_encoders = (('Country', lec), ('State', les))

# Fit on the training frame and replace the names by integer codes.
for _column, _encoder in _region_encoders:
    X_xTrain[_column] = _encoder.fit_transform(X_xTrain[_column])

print(X_xTrain.head())

# Encode the overlap frame with the already fitted encoders.
for _column, _encoder in _region_encoders:
    X_xTest[_column] = _encoder.transform(X_xTest[_column])

print(X_xTest.head())

# This selection is not the last expression of the cell, so its result is discarded.
xtrain.loc[xtrain['Country'] == 'Afghanistan', :]
print(xtest.tail())

In [None]:
from warnings import filterwarnings
# Silence every Python warning from here on.
# NOTE(review): this also hides pandas/sklearn deprecation and chained-assignment
# warnings — consider narrowing the filter.
filterwarnings('ignore')

from sklearn import preprocessing

# NOTE(review): `le` is not referenced by any active code visible in this notebook.
le = preprocessing.LabelEncoder()

from xgboost import XGBRegressor

# Distinct values of the Country column of the training feature frame.
countries = X_xTrain.Country.unique()


In [None]:
# Display the feature frame built from the train/test overlap.
X_xTest

## Predict data and Create submission file from test data

In [None]:
# # Predict data and Create submission file from test data
# xout = pd.DataFrame({'ForecastId': [], 'ConfirmedCases': [], 'Fatalities': []})

# from sklearn.model_selection import GridSearchCV, train_test_split
# from xgboost import XGBRegressor
# k_log_col=['lag_'+str(j+1)+'_ConfirmedCases' for j in range(14)]+['lag_'+str(j+1)+'_Fatalities' for j in range(14)]
# from sklearn.metrics import r2_score
# params = {'min_child_weight':[4,5], 'gamma':[i/10.0 for i in range(3,6)],  'subsample':[i/10.0 for i in range(6,8)],
# 'colsample_bytree':[i/10.0 for i in range(6,8)], 'max_depth': [3,4,5],'n_estimators':[500,1000],'learning_rate': [.03, 0.05, .07],
#      'objective':['reg:squaredlogerror']}
# # params = {'min_child_weight':[4,5], 'gamma':[i/10.0 for i in range(3,6)],  'subsample':[i/10.0 for i in range(6,7)],
# # 'colsample_bytree':[i/10.0 for i in range(6,10)], 'max_depth': [2,3,4],'n_estimators':[500,1000],
# #      'objective':['reg:squaredlogerror']}
# params = {'min_child_weight':[4,5], 'gamma':[i/10.0 for i in range(3,4,5)],  'subsample':[i/10.0 for i in range(6,7)],
# 'colsample_bytree':[i/10.0 for i in range(6,7)], 'max_depth': [2,3,4],'n_estimators':[1000],
#      'objective':['reg:squaredlogerror']}
# def RMSLE(pred,actual):
#     return np.sqrt(np.mean(np.power((np.log(pred+1)-np.log(actual+1)),2)))
# import time
# for country in countries[:1]:
#     st_time=time.time()
#     states = X_xTrain.loc[X_xTrain.Country == country, :].State.unique()
#     #print(country, states)
#     # check whether string is nan or not
#     for state in states:
#         print(lec.inverse_transform([country]), les.inverse_transform([state]) )
#         X_xTrain_CS = X_xTrain.loc[(X_xTrain.Country == country) & (X_xTrain.State == state), ['State', 'Country', 'Date','Date_ACT', 'ConfirmedCases', 'Fatalities']+k_log_col]
        
#         #print(X_xTrain_CS.Country.unique())
#         y1_xTrain_CS = X_xTrain_CS.loc[:, 'ConfirmedCases']
#         y2_xTrain_CS = X_xTrain_CS.loc[:, 'Fatalities']
        
#         X_xTrain_CS1 = X_xTrain_CS.loc[:, ['State', 'Country']+k_log_col]
#         le1=preprocessing.LabelEncoder().fit(X_xTrain_CS1.Country)
#         le2=preprocessing.LabelEncoder().fit(X_xTrain_CS1['State'])
#         X_xTrain_CS1.Country = le1.transform(X_xTrain_CS1.Country)
#         X_xTrain_CS1['State'] = le2.transform(X_xTrain_CS1['State'])
        
#         X_xTest_CS = X_xTest.loc[(X_xTest.Country == country) & (X_xTest.State == state), ['State', 'Country', 'Date', 'ForecastId']+k_log_col]
#         y1_xTest_CS = for_y[(X_xTest.Country == country) & (X_xTest.State == state)].iloc[:, -2]
#         y2_xTest_CS = for_y[(X_xTest.Country == country) & (X_xTest.State == state)].iloc[:, -1]
#         X_xTest_CS_Id = X_xTest_CS.loc[:, 'ForecastId']
#         X_xTest_CS1 = X_xTest_CS.loc[:, ['State', 'Country']+k_log_col]
        
#         X_xTest_CS1.Country = le1.transform(X_xTest_CS.Country)
#         X_xTest_CS1['State'] = le2.transform(X_xTest_CS['State'])
        
#         #models_C[country] = gridSearchCV(model, X_Train_CS, y1_Train_CS, param_grid, 10, 'neg_mean_squared_error')
#         #models_F[country] = gridSearchCV(model, X_Train_CS, y2_Train_CS, param_grid, 10, 'neg_mean_squared_error')
        

#         xgb1 = XGBRegressor(nthread=-1) 

#         grid = GridSearchCV(xgb1, params)
#         grid.fit(X_xTrain_CS1, y1_xTrain_CS)

#         # Print the r2 score
#         print(r2_score(y1_xTest_CS, grid.best_estimator_.predict(X_xTest_CS1))) 
#         print(RMSLE(y1_xTest_CS, grid.best_estimator_.predict(X_xTest_CS1))) 

#         # Save the file
#         y1_xpred = grid.best_estimator_.predict(X_xTest_CS1)
        
# #         xmodel2 = XGBRegressor(n_estimators=1000)
# #         xmodel2.fit(X_xTrain_CS, y2_xTrain_CS)
# #         y2_xpred = xmodel2.predict(X_xTest_CS)
#         xgb2 = XGBRegressor(nthread=-1)
#         grid1 = GridSearchCV(xgb2, params)
#         grid1.fit(X_xTrain_CS1, y2_xTrain_CS)

#         # Print the r2 score
#         print(r2_score(y2_xTest_CS, grid1.best_estimator_.predict(X_xTest_CS1))) 
#         print(RMSLE(y2_xTest_CS, grid1.best_estimator_.predict(X_xTest_CS1))) 
#         # Save the file
#         y2_xpred = grid1.best_estimator_.predict(X_xTest_CS1)
        
#         xdata = pd.DataFrame({'ForecastId': X_xTest_CS_Id, 'ConfirmedCases': y1_xpred, 'Fatalities': y2_xpred})
#         xout = pd.concat([xout, xdata], axis=0)
#         print('Total time for 1 run ' + str(time.time()-st_time))


In [None]:
# Predict data and Create submission file from test data
# NOTE(review): `xout` is only consumed by commented-out code in this notebook.
xout = pd.DataFrame({'ForecastId': [], 'ConfirmedCases': [], 'Fatalities': []})

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

# Names of the 14 + 14 lag feature columns created on the training frame.
k_log_col = (['lag_' + str(j + 1) + '_ConfirmedCases' for j in range(14)]
             + ['lag_' + str(j + 1) + '_Fatalities' for j in range(14)])

# Wider grid kept for reference. It used to be assigned to `params` and was then
# immediately overwritten by the small grid below, so it never reached the search.
params_wide = {
    'min_child_weight': [4, 5],
    'gamma': [i / 10.0 for i in range(3, 6)],
    'subsample': [i / 10.0 for i in range(6, 8)],
    'colsample_bytree': [i / 10.0 for i in range(6, 8)],
    'max_depth': [3, 4, 5],
    'n_estimators': [500, 1000],
    'learning_rate': [.03, 0.05, .07],
    'objective': ['reg:squaredlogerror'],
}

# Grid actually passed to GridSearchCV.
params = {'min_child_weight': [4, 5], 'max_depth': [6], 'n_estimators': [1000]}
def RMSLE(pred,actual):
    """Root mean squared logarithmic error between two equally shaped inputs.

    Computes sqrt(mean((log(1 + pred) - log(1 + actual)) ** 2)). The metric is
    symmetric in its two arguments.

    Parameters
    ----------
    pred, actual : array-like of numbers
        Compared element by element (positionally). Values <= -1 produce NaN or
        infinity, as with the plain logarithm.

    Returns
    -------
    numpy.float64
    """
    pred = np.asarray(pred, dtype=float)
    actual = np.asarray(actual, dtype=float)
    # log1p keeps precision for values close to zero, where log(x + 1) rounds to 0.
    return np.sqrt(np.mean(np.square(np.log1p(pred) - np.log1p(actual))))
import time

# Columns of the prepared training frame that the modelling steps work with.
X_xTrain_CS = X_xTrain.loc[:, ['State', 'Country', 'Date', 'Date_ACT', 'ConfirmedCases', 'Fatalities'] + k_log_col]

# Only rows whose confirmed count grew relative to lag_1 are used for fitting.
_growth_mask = (X_xTrain_CS['ConfirmedCases'] - X_xTrain_CS['lag_1_ConfirmedCases']) > 0
_grown_rows = X_xTrain_CS[_growth_mask]

# Targets are the change relative to lag_1 (index is left as in the source frame).
y1_xTrain_CS = _grown_rows['ConfirmedCases'] - _grown_rows['lag_1_ConfirmedCases']
y2_xTrain_CS = _grown_rows['Fatalities'] - _grown_rows['lag_1_Fatalities']

X_xTrain_CS1 = _grown_rows.loc[:, ['State', 'Country'] + k_log_col].reset_index(drop=True)

# One-hot encoders are fitted on all training rows, not just the filtered ones.
oh1 = preprocessing.OneHotEncoder(sparse=False).fit(X_xTrain_CS['Country'].to_numpy().reshape(-1, 1))
oh2 = preprocessing.OneHotEncoder(sparse=False).fit(X_xTrain_CS['State'].to_numpy().reshape(-1, 1))

# Training matrix: label codes + lags + country one-hot + state one-hot.
all_val = oh1.transform(X_xTrain_CS1['Country'].to_numpy().reshape(-1, 1))
col = ['cnty_{}'.format(k) for k in range(all_val.shape[1])]
all_val_State = oh2.transform(X_xTrain_CS1['State'].to_numpy().reshape(-1, 1))
col_State = ['state_{}'.format(k) for k in range(all_val_State.shape[1])]
X_xTrain_CS1 = pd.concat(
    [X_xTrain_CS1,
     pd.DataFrame(all_val, columns=col),
     pd.DataFrame(all_val_State, columns=col_State)],
    axis=1,
)

# Evaluation frame built from the train/test overlap, with the same target definition.
X_xTest_CS = X_xTest.loc[:, ['State', 'Country', 'Date', 'ForecastId'] + k_log_col]
y1_xTest_CS = for_y['ConfirmedCases'] - for_y['lag_1_ConfirmedCases']
y2_xTest_CS = for_y['Fatalities'] - for_y['lag_1_Fatalities']
X_xTest_CS_Id = X_xTest_CS['ForecastId']
X_xTest_CS1 = X_xTest_CS.loc[:, ['State', 'Country'] + k_log_col]

# Evaluation matrix: same column layout as the training matrix.
all_val = oh1.transform(X_xTest_CS1['Country'].to_numpy().reshape(-1, 1))
col = ['cnty_{}'.format(k) for k in range(all_val.shape[1])]
all_val_State = oh2.transform(X_xTest_CS1['State'].to_numpy().reshape(-1, 1))
col_State = ['state_{}'.format(k) for k in range(all_val_State.shape[1])]
X_xTest_CS1 = pd.concat(
    [X_xTest_CS1,
     pd.DataFrame(all_val, columns=col),
     pd.DataFrame(all_val_State, columns=col_State)],
    axis=1,
)
# # X_xTest_CS1.Country = le1.transform(X_xTest_CS.Country)
# # X_xTest_CS1['State'] = le2.transform(X_xTest_CS['State'])

# # #models_C[country] = gridSearchCV(model, X_Train_CS, y1_Train_CS, param_grid, 10, 'neg_mean_squared_error')
# # #models_F[country] = gridSearchCV(model, X_Train_CS, y2_Train_CS, param_grid, 10, 'neg_mean_squared_error')


In [None]:
# Display the fatality training target (Fatalities minus lag_1_Fatalities on the filtered rows).
y2_xTrain_CS

In [None]:


# Grid-search an XGBoost regressor for the change in confirmed cases.
# Only `n_jobs` is passed: `nthread` is its deprecated alias and was redundant.
xgb1 = XGBRegressor(n_jobs=-1)

grid = GridSearchCV(xgb1, params)
grid.fit(X_xTrain_CS1, y1_xTrain_CS)




In [None]:
# --- Confirmed-cases model: scores on the train/test overlap -------------------
# NOTE: the overlap rows come from the training data itself, so these scores are
# not out-of-sample.
# Predict once and reuse the array instead of calling predict() for every metric.
y1_xpred = grid.best_estimator_.predict(X_xTest_CS1)

# R^2 with negative predictions clipped to 0; RMSLE with non-positive values
# replaced by 0.001 on both sides.
print(r2_score(y1_xTest_CS, np.where(y1_xpred < 0, 0, y1_xpred)))
print(RMSLE(np.where(y1_xTest_CS <= 0, 0.001, y1_xTest_CS), np.where(y1_xpred <= 0, 0.001, y1_xpred)))

# --- Fatalities model -----------------------------------------------------------
# Only `n_jobs` is passed: `nthread` is its deprecated alias and was redundant.
xgb2 = XGBRegressor(n_jobs=-1)
grid1 = GridSearchCV(xgb2, params)
grid1.fit(X_xTrain_CS1, y2_xTrain_CS)

y2_xpred = grid1.best_estimator_.predict(X_xTest_CS1)

# Clipped scores, as for the confirmed-cases model.
print(r2_score(y2_xTest_CS, np.where(y2_xpred < 0, 0, y2_xpred)))
print(RMSLE(np.where(y2_xTest_CS <= 0, 0.001, y2_xTest_CS), np.where(y2_xpred <= 0, 0.001, y2_xpred)))
# Unclipped scores; the RMSLE can be NaN when a value is <= -1.
print(r2_score(y2_xTest_CS, y2_xpred))
print(RMSLE(y2_xTest_CS, y2_xpred))

# xdata = pd.DataFrame({'ForecastId': X_xTest_CS_Id, 'ConfirmedCases': y1_xpred, 'Fatalities': y2_xpred})
# xout = pd.concat([xout, xdata], axis=0)
# print('Total time for 1 run ' + str(time.time()-st_time))


In [None]:
# Backup of the filtered training matrix.
X_xTrain_CS1_bk = X_xTrain_CS1.copy()

# One-hot encode Country and State for *all* training rows (not only the filtered ones).
all_val = oh1.transform(X_xTrain_CS['Country'].to_numpy().reshape(-1, 1))
col = ['cnty_{}'.format(k) for k in range(all_val.shape[1])]
all_val_State = oh2.transform(X_xTrain_CS['State'].to_numpy().reshape(-1, 1))
col_State = ['state_{}'.format(k) for k in range(all_val_State.shape[1])]

_encoded_all = pd.concat(
    [X_xTrain_CS.copy(),
     pd.DataFrame(all_val, columns=col),
     pd.DataFrame(all_val_State, columns=col_State)],
    axis=1,
)

# Keep the model's feature columns in training order, then attach the real date
# and the two cumulative targets.
X_xTrain_CS12 = _encoded_all[X_xTrain_CS1.columns].copy()
for _extra in ('Date_ACT', 'Fatalities', 'ConfirmedCases'):
    X_xTrain_CS12[_extra] = X_xTrain_CS[_extra]

In [None]:
# Roll both fitted models forward one day at a time and append the forecasts to
# X_xTrain_CS12.
#
# For every new day the lag columns are shifted by one position (lag_1 <- latest
# value, lag_k <- previous lag_(k-1)), the models predict the daily change from
# those shifted lags, and the change is added to the previous cumulative value.
#
# Fix: the prediction is made from the *new* row's shifted lag features. It used
# to be made from the last known day's own features, which yields the change for
# a day that is already known and left every forecast one step stale.
#
# NOTE: re-running this cell extends the frame by another FORECAST_DAYS days.
FORECAST_DAYS = 60

feature_cols = ['State', 'Country'] + k_log_col + col + col_State
_cases_lags = ['lag_{}_ConfirmedCases'.format(i) for i in range(1, 15)]
_fatal_lags = ['lag_{}_Fatalities'.format(i) for i in range(1, 15)]
# `lt` are the columns written, `lt1` the columns they are copied from (same order).
lt = _cases_lags + _fatal_lags
lt1 = ['ConfirmedCases'] + _cases_lags[:-1] + ['Fatalities'] + _fatal_lags[:-1]

# Rows of the most recent date: one per region, the starting point of the roll-out.
latest = X_xTrain_CS12[X_xTrain_CS12['Date_ACT'] == X_xTrain_CS12['Date_ACT'].max()]

forecast_frames = []
for _ in range(FORECAST_DAYS):
    # Work on an explicit copy so nothing is written into a slice of the big frame.
    prev = latest.copy()
    prev[lt] = latest[lt1].to_numpy()

    # Non-positive predicted changes are replaced by 0.001 so the cumulative
    # series never decreases.
    cc = grid.best_estimator_.predict(prev[feature_cols])
    prev['ConfirmedCases'] = prev['lag_1_ConfirmedCases'] + np.where(cc <= 0, 0.001, cc)
    Fl = grid1.best_estimator_.predict(prev[feature_cols])
    prev['Fatalities'] = prev['lag_1_Fatalities'] + np.where(Fl <= 0, 0.001, Fl)

    prev['Date_ACT'] = prev['Date_ACT'] + pd.Timedelta(days=1)

    forecast_frames.append(prev)
    latest = prev

# Concatenate once at the end (DataFrame.append was removed in pandas 2.0 and
# growing the frame inside the loop copies it on every iteration).
X_xTrain_CS12 = pd.concat([X_xTrain_CS12] + forecast_frames, ignore_index=True)

In [None]:
# Take a copy so that adding columns to X_xTrain_CS2 leaves X_xTrain_CS12 unchanged.
X_xTrain_CS2=X_xTrain_CS12.copy()

In [None]:
# Decode the integer labels back to the original names; the decoded columns are
# the join keys used when matching against the raw test file.
X_xTrain_CS2 = X_xTrain_CS2.assign(
    Country_1=lec.inverse_transform(X_xTrain_CS2['Country']),
    State_1=les.inverse_transform(X_xTrain_CS2['State']),
)

In [None]:
# Display the frame with the decoded Country_1 / State_1 columns.
X_xTrain_CS2

In [None]:
# Reload the raw test file and prepare its join keys the same way as the training
# features: short column names, parsed dates, and State falling back to Country.
xtest = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-3/test.csv')
xsubmission = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-3/submission.csv')

xtest = xtest.rename(columns={'Country_Region': 'Country', 'Province_State': 'State'})

# `infer_datetime_format` is deprecated in pandas 2.x; default inference is used.
xtest['Date'] = pd.to_datetime(xtest['Date'])

# EMPTY_VAL comes from the earlier preprocessing cell; it is not redefined here
# (nor is fillState) to avoid duplicate definitions shadowing each other.
# Vectorised equivalent of applying fillState row by row: a missing state becomes
# the country name.
_test_state = xtest['State'].fillna(EMPTY_VAL)
xtest['State'] = _test_state.mask(_test_state == EMPTY_VAL, xtest['Country'])
xtest

In [None]:
# Attach the actual/forecast values to every test row via (state, country, date).
xtest1 = xtest.merge(
    X_xTrain_CS2,
    left_on=['State', 'Country', 'Date'],
    right_on=['State_1', 'Country_1', 'Date_ACT'],
    how='inner',
)

# An inner merge silently drops test rows without a match (and duplicates rows on
# repeated keys); either case would produce an invalid submission, so fail loudly.
_missing_ids = set(xtest['ForecastId']) - set(xtest1['ForecastId'])
assert not _missing_ids, "{} test rows have no forecast".format(len(_missing_ids))
assert xtest1['ForecastId'].is_unique, "some test rows matched more than one forecast row"

xtest1

In [None]:
# Spot check: values attached to the US / New York test rows.
_is_new_york = (xtest1['Country_x'] == 'US') & (xtest1['State_x'] == 'New York')
xtest1.loc[_is_new_york, ['Date', 'ConfirmedCases', 'Fatalities']]

In [None]:
# Spot check: values attached to the India test rows (State equals the country
# name because India has no state recorded).
_is_india = (xtest1['Country_x'] == 'India') & (xtest1['State_x'] == 'India')
xtest1.loc[_is_india, ['Date', 'ConfirmedCases', 'Fatalities']]

In [None]:
# Write the submission file. Columns are emitted as ForecastId, ConfirmedCases,
# Fatalities — the same layout as the `xout` frame defined earlier in the notebook
# (presumably the layout of the competition's sample submission — confirm).
xtest1[['ForecastId', 'ConfirmedCases', 'Fatalities']].to_csv('submission.csv', index=False)
print("Submission file Created.....")