<a href="https://colab.research.google.com/github/PodBar/machine_learning_bootcamp/blob/master/supervised/01_basics/03_feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
import sklearn

sklearn.__version__

'0.22.2.post1'

In [0]:
def fetch_financial_data(company="AMZN"):
  """
  Download stock market quotations for a single ticker.

  The request is delegated to ``pandas_datareader``'s ``DataReader`` with
  the ``"stooq"`` data source, so calling this function needs network access.

  Parameters
  ----------
  company : str, default "AMZN"
      Ticker symbol forwarded to ``DataReader`` as ``name``.

  Returns
  -------
  Whatever ``DataReader`` returns for the request (in this notebook: a
  date-indexed frame with Open/High/Low/Close/Volume columns).
  """
  # Imported lazily so the dependency is only required when data is fetched.
  from pandas_datareader import data as web

  quotes = web.DataReader(name=company, data_source="stooq")
  return quotes

# Fetch quotes for the default ticker and preview the first rows.
df_raw = fetch_financial_data()
df_raw.head(n=5)

  from pandas.util.testing import assert_frame_equal


Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-04-17,2372.33,2400.0,2316.02,2375.0,7930010
2020-04-16,2346.0,2460.9981,2335.0,2408.19,12038201
2020-04-15,2257.68,2333.369,2245.0,2307.68,6866567
2020-04-14,2200.47,2292.0,2186.21,2283.32,8087193
2020-04-13,2040.0,2180.0,2038.0,2168.87,6716709


In [0]:
# Work on an independent five-row sample so df_raw itself is never modified.
df = df_raw.head(5).copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5 entries, 2020-04-17 to 2020-04-13
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    5 non-null      float64
 1   High    5 non-null      float64
 2   Low     5 non-null      float64
 3   Close   5 non-null      float64
 4   Volume  5 non-null      int64  
dtypes: float64(4), int64(1)
memory usage: 240.0 bytes


In [0]:
# Derive calendar features from the DatetimeIndex in a single step.
df = df.assign(
    day=df.index.day,
    month=df.index.month,
    year=df.index.year,
)
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,day,month,year
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-04-17,2372.33,2400.0,2316.02,2375.0,7930010,17,4,2020
2020-04-16,2346.0,2460.9981,2335.0,2408.19,12038201,16,4,2020
2020-04-15,2257.68,2333.369,2245.0,2307.68,6866567,15,4,2020
2020-04-14,2200.47,2292.0,2186.21,2283.32,8087193,14,4,2020
2020-04-13,2040.0,2180.0,2038.0,2168.87,6716709,13,4,2020


In [0]:
# Toy sample of heights used by the binning examples that follow.
df = pd.DataFrame(
    data={
        "height": [175.0, 178.5, 185.0, 191.0, 184.5, 183.0, 168.0],
    }
)
df

Unnamed: 0,height
0,175.0
1,178.5
2,185.0
3,191.0
4,184.5
5,183.0
6,168.0


In [0]:
# Split the observed height range into three equal-width intervals.
df["height_cat"] = pd.cut(df["height"], bins=3)
df

Unnamed: 0,height,height_cat
0,175.0,"(167.977, 175.667]"
1,178.5,"(175.667, 183.333]"
2,185.0,"(183.333, 191.0]"
3,191.0,"(183.333, 191.0]"
4,184.5,"(183.333, 191.0]"
5,183.0,"(175.667, 183.333]"
6,168.0,"(167.977, 175.667]"


In [0]:
# Bin with hand-picked edges; intervals are right-closed, so 175.0 lands in (160, 175].
df["height_cat"] = pd.cut(df["height"], bins=[160, 175, 180, 195])
df

Unnamed: 0,height,height_cat
0,175.0,"(160, 175]"
1,178.5,"(175, 180]"
2,185.0,"(180, 195]"
3,191.0,"(180, 195]"
4,184.5,"(180, 195]"
5,183.0,"(180, 195]"
6,168.0,"(160, 175]"


In [0]:
# Same edges as before, but each interval gets a readable label.
df["height_cat"] = pd.cut(
    df["height"],
    bins=[160, 175, 180, 195],
    labels=["small", "medium", "high"],
)
df

Unnamed: 0,height,height_cat
0,175.0,small
1,178.5,medium
2,185.0,high
3,191.0,high
4,184.5,high
5,183.0,high
6,168.0,small


In [0]:
# One-hot encode the categorical column; drop_first removes the first level
# ("small"), which becomes the implicit baseline.
pd.get_dummies(
    data=df,
    columns=["height_cat"],
    prefix="height",
    drop_first=True,
)

Unnamed: 0,height,height_medium,height_high
0,175.0,0,0
1,178.5,1,0
2,185.0,0,1
3,191.0,0,1
4,184.5,0,1
5,183.0,0,1
6,168.0,0,0


In [0]:
# Each row holds a list of language codes (a list-valued column).
df = pd.DataFrame(
    data={
        "lang": [
            ["PL", "Eng"],
            ["PL", "Eng", "Ger", "Fra"],
            ["Rus"],
        ],
    }
)
df

Unnamed: 0,lang
0,"[PL, Eng]"
1,"[PL, Eng, Ger, Fra]"
2,[Rus]


In [0]:
# Count the languages per row; .str.len() also measures list-valued elements.
df["lang_number"] = df["lang"].str.len()

In [0]:
df

Unnamed: 0,lang,lang_number
0,"[PL, Eng]",2
1,"[PL, Eng, Ger, Fra]",4
2,[Rus],1


In [0]:
# Binary indicator: 1 when "PL" appears in the row's language list, else 0.
df["PL_flag"] = df["lang"].map(lambda langs: int("PL" in langs))
df

Unnamed: 0,lang,lang_number,PL_flag
0,"[PL, Eng]",2,1
1,"[PL, Eng, Ger, Fra]",4,1
2,[Rus],1,0


In [0]:
# Sample domain names to be split into name and extension.
df = pd.DataFrame(
    data={
        "website": ["wp.pl", "onet.pl", "google.com"],
    }
)
df

Unnamed: 0,website
0,wp.pl
1,onet.pl
2,google.com


In [0]:
# Split every domain on "." and spread the pieces over separate columns.
new = df["website"].str.split(".", expand=True)

In [0]:
new

Unnamed: 0,0,1
0,wp,pl
1,onet,pl
2,google,com


In [0]:
# Give the positional columns descriptive names (rebinding instead of inplace=True).
new = new.rename(columns={0: "portal", 1: "extension"})

In [0]:
# Attach the split parts to the original frame, aligned on the shared index.
final = df.join(new, how="left")

In [0]:
final

Unnamed: 0,website,portal,extension
0,wp.pl,wp,pl
1,onet.pl,onet,pl
2,google.com,google,com
