
*   Computing Platforms: Set up the Workspace for Machine Learning Projects.  https://ms.pubpub.org/pub/computing
*  Machine Learning for Predictions. https://ms.pubpub.org/pub/ml-prediction
* Machine Learning Packages: https://scikit-learn.org/stable/


# Part I: Import and Inspect Data

In [2]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('./foreign_investment.csv')
df.head()

Unnamed: 0,Year,FDI,GDP,VALUE,SAVING,SAVING_RATE
0,1980,0.02982,191149211648,3822984233,67495989500,35.310629
1,1981,0.135296,195866378240,3917327565,65779071687,33.583646
2,1982,0.209664,205089701888,4101794038,68568971469,33.433649
3,1983,0.275699,230686752768,4613735055,75085127996,32.548522
4,1984,0.483946,259946512384,5198930248,88994888904,34.235846


# Part II: Prepare the Y varible for Regression

## 2.1. Write functions to calculte the Y variable for Regression 

*(skip the step if the Y variable already exists)*

In [4]:
df['iyoy'] = df['FDI']-df['FDI'].shift(1)
df['syoy'] = df['SAVING_RATE']-df['SAVING_RATE'].shift(1)
df.head()

Unnamed: 0,Year,FDI,GDP,VALUE,SAVING,SAVING_RATE,iyoy,syoy
0,1980,0.02982,191149211648,3822984233,67495989500,35.310629,,
1,1981,0.135296,195866378240,3917327565,65779071687,33.583646,0.105477,-1.726983
2,1982,0.209664,205089701888,4101794038,68568971469,33.433649,0.074368,-0.149997
3,1983,0.275699,230686752768,4613735055,75085127996,32.548522,0.066034,-0.885127
4,1984,0.483946,259946512384,5198930248,88994888904,34.235846,0.208247,1.687325


## 2.2. Make Sure that the Data Type of Y is "numeric"

In [5]:
df.dtypes

Year             int64
FDI            float64
GDP              int64
VALUE            int64
SAVING           int64
SAVING_RATE    float64
iyoy           float64
syoy           float64
dtype: object

# Part III: Prepare the Y variable for Classification

reference:

https://datatofish.com/if-condition-in-pandas-dataframe/ *italicized text*

In [6]:
#@title Define the Congestion Threshold
cut = 0.0 #@param {type:"number"}

In [7]:
df['icongested'] = df['iyoy'] >= cut
df['scongested'] = df['syoy'] >= cut
df.head(10)

Unnamed: 0,Year,FDI,GDP,VALUE,SAVING,SAVING_RATE,iyoy,syoy,icongested,scongested
0,1980,0.02982,191149211648,3822984233,67495989500,35.310629,,,False,False
1,1981,0.135296,195866378240,3917327565,65779071687,33.583646,0.105477,-1.726983,True,False
2,1982,0.209664,205089701888,4101794038,68568971469,33.433649,0.074368,-0.149997,True,False
3,1983,0.275699,230686752768,4613735055,75085127996,32.548522,0.066034,-0.885127,True,False
4,1984,0.483946,259946512384,5198930248,88994888904,34.235846,0.208247,1.687325,True,True
5,1985,0.536047,309488025600,6189760512,108101540816,34.929151,0.052101,0.693305,True,True
6,1986,0.623425,300758106112,6015162122,105661789478,35.131818,0.087378,0.202666,True,True
7,1987,0.847703,272972972032,5459459441,101593543706,37.217437,0.224278,2.085619,True,True
8,1988,1.022559,312353619968,6247072399,118333692506,37.884527,0.174856,0.66709,True,True
9,1989,0.97565,347768061952,6955361239,124211785334,35.716847,-0.046909,-2.167681,False,False


In [8]:
df.loc[(df['iyoy'] >= cut), 'icongested'] = 1
df.loc[(df['iyoy'] <cut), 'icongested'] = 0
df.loc[(df['syoy'] >= cut), 'scongested'] = 1
df.loc[(df['syoy'] <cut), 'scongested'] = 0
df.head(10)

Unnamed: 0,Year,FDI,GDP,VALUE,SAVING,SAVING_RATE,iyoy,syoy,icongested,scongested
0,1980,0.02982,191149211648,3822984233,67495989500,35.310629,,,False,False
1,1981,0.135296,195866378240,3917327565,65779071687,33.583646,0.105477,-1.726983,1,0
2,1982,0.209664,205089701888,4101794038,68568971469,33.433649,0.074368,-0.149997,1,0
3,1983,0.275699,230686752768,4613735055,75085127996,32.548522,0.066034,-0.885127,1,0
4,1984,0.483946,259946512384,5198930248,88994888904,34.235846,0.208247,1.687325,1,1
5,1985,0.536047,309488025600,6189760512,108101540816,34.929151,0.052101,0.693305,1,1
6,1986,0.623425,300758106112,6015162122,105661789478,35.131818,0.087378,0.202666,1,1
7,1987,0.847703,272972972032,5459459441,101593543706,37.217437,0.224278,2.085619,1,1
8,1988,1.022559,312353619968,6247072399,118333692506,37.884527,0.174856,0.66709,1,1
9,1989,0.97565,347768061952,6955361239,124211785334,35.716847,-0.046909,-2.167681,0,0


# 3.3. Method 3: Lambda function

In [9]:
df['icongested'] = df['iyoy'].apply(lambda x: 1 if x>= cut else 0)
df['scongested'] = df['syoy'].apply(lambda x: 1 if x>= cut else 0)
df.head(10)

Unnamed: 0,Year,FDI,GDP,VALUE,SAVING,SAVING_RATE,iyoy,syoy,icongested,scongested
0,1980,0.02982,191149211648,3822984233,67495989500,35.310629,,,0,0
1,1981,0.135296,195866378240,3917327565,65779071687,33.583646,0.105477,-1.726983,1,0
2,1982,0.209664,205089701888,4101794038,68568971469,33.433649,0.074368,-0.149997,1,0
3,1983,0.275699,230686752768,4613735055,75085127996,32.548522,0.066034,-0.885127,1,0
4,1984,0.483946,259946512384,5198930248,88994888904,34.235846,0.208247,1.687325,1,1
5,1985,0.536047,309488025600,6189760512,108101540816,34.929151,0.052101,0.693305,1,1
6,1986,0.623425,300758106112,6015162122,105661789478,35.131818,0.087378,0.202666,1,1
7,1987,0.847703,272972972032,5459459441,101593543706,37.217437,0.224278,2.085619,1,1
8,1988,1.022559,312353619968,6247072399,118333692506,37.884527,0.174856,0.66709,1,1
9,1989,0.97565,347768061952,6955361239,124211785334,35.716847,-0.046909,-2.167681,0,0


## 3.2. Method 2: Cut function

reference: 

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.cut.html

In [10]:
import numpy as np
 
icongested = pd.cut(df['iyoy'], bins=[-1,0,1], labels=[0,1]) #might have problems at boundaries
df.insert(3, 'icongested2',icongested)

In [11]:
df.head(10)

Unnamed: 0,Year,FDI,GDP,icongested2,VALUE,SAVING,SAVING_RATE,iyoy,syoy,icongested,scongested
0,1980,0.02982,191149211648,,3822984233,67495989500,35.310629,,,0,0
1,1981,0.135296,195866378240,1.0,3917327565,65779071687,33.583646,0.105477,-1.726983,1,0
2,1982,0.209664,205089701888,1.0,4101794038,68568971469,33.433649,0.074368,-0.149997,1,0
3,1983,0.275699,230686752768,1.0,4613735055,75085127996,32.548522,0.066034,-0.885127,1,0
4,1984,0.483946,259946512384,1.0,5198930248,88994888904,34.235846,0.208247,1.687325,1,1
5,1985,0.536047,309488025600,1.0,6189760512,108101540816,34.929151,0.052101,0.693305,1,1
6,1986,0.623425,300758106112,1.0,6015162122,105661789478,35.131818,0.087378,0.202666,1,1
7,1987,0.847703,272972972032,1.0,5459459441,101593543706,37.217437,0.224278,2.085619,1,1
8,1988,1.022559,312353619968,1.0,6247072399,118333692506,37.884527,0.174856,0.66709,1,1
9,1989,0.97565,347768061952,0.0,6955361239,124211785334,35.716847,-0.046909,-2.167681,0,0


# Part III: Create the X variables

## 3.1. Shift the Y to get past values

reference:
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.shift.html

# Part IV Train and Test Split

In [12]:
from sklearn.model_selection import TimeSeriesSplit

In [13]:
tss = TimeSeriesSplit()
for train_idx, test_idx in tss.split(df):
    pass

In [14]:
train_df = df.filter(items=train_idx, axis=0)
test_df =  df.filter(items=test_idx, axis=0)
train_df.head()

Unnamed: 0,Year,FDI,GDP,icongested2,VALUE,SAVING,SAVING_RATE,iyoy,syoy,icongested,scongested
0,1980,0.02982,191149211648,,3822984233,67495989500,35.310629,,,0,0
1,1981,0.135296,195866378240,1.0,3917327565,65779071687,33.583646,0.105477,-1.726983,1,0
2,1982,0.209664,205089701888,1.0,4101794038,68568971469,33.433649,0.074368,-0.149997,1,0
3,1983,0.275699,230686752768,1.0,4613735055,75085127996,32.548522,0.066034,-0.885127,1,0
4,1984,0.483946,259946512384,1.0,5198930248,88994888904,34.235846,0.208247,1.687325,1,1


In [15]:
test_df.head()

Unnamed: 0,Year,FDI,GDP,icongested2,VALUE,SAVING,SAVING_RATE,iyoy,syoy,icongested,scongested
35,2015,2.192182,11061553004544,0,221231060091,5089321787006,46.009107,-0.367052,-1.465948,0,0
36,2016,1.555642,11233276198912,0,224665523978,5050623559476,44.961269,-0.636539,-1.047838,0,0
37,2017,1.349133,12310409117696,0,246208182354,5555981442269,45.132387,-0.206509,0.171117,0,1
38,2018,1.693905,13894817939456,1,277896358789,6244652135107,44.94231,0.344773,-0.190077,1,0
39,2019,1.310719,14279937884160,0,285598757683,6280520248263,43.981426,-0.383187,-0.960884,0,0


# Part IV Prepare the Train and Test Data for Classification and Regression

In [16]:
df_C =df[['scongested','iyoy']]
df_R =df[['syoy','iyoy']]

In [17]:
# please complete the code on your own
for train_idx, test_idx in tss.split(df_C):
    pass
df_C_train = df_C.filter(items=train_idx, axis=0)
df_C_test =  df_C.filter(items=test_idx, axis=0)
df_C_train.head()

Unnamed: 0,scongested,iyoy
0,0,
1,0,0.105477
2,0,0.074368
3,0,0.066034
4,1,0.208247


In [18]:
df_C_test.head()

Unnamed: 0,scongested,iyoy
35,0,-0.367052
36,0,-0.636539
37,1,-0.206509
38,0,0.344773
39,0,-0.383187


In [19]:
# please complete the code on your own
for train_idx, test_idx in tss.split(df_R):
    pass
df_R_train = df_R.filter(items=train_idx, axis=0)
df_R_test =  df_R.filter(items=test_idx, axis=0)
df_R_train.head()

Unnamed: 0,syoy,iyoy
0,,
1,-1.726983,0.105477
2,-0.149997,0.074368
3,-0.885127,0.066034
4,1.687325,0.208247


In [20]:
df_R_test.head()

Unnamed: 0,syoy,iyoy
35,-1.465948,-0.367052
36,-1.047838,-0.636539
37,0.171117,-0.206509
38,-0.190077,0.344773
39,-0.960884,-0.383187


In [21]:
#export and save the data to the processed data folder, please complete the code on your own
df_C_train.to_csv('df3_C_train.csv')
df_C_test.to_csv('df3_C_test.csv')

In [22]:
df_R_train.to_csv('df3_R_train.csv')
df_R_test.to_csv('df3_R_test.csv')