In [302]:
import pandas as pd
import numpy as np



In [296]:
def SimpleImputer(df,strategy='mean',columns= [],fill_value=0):
    df.index+=1
    if strategy not in ['mean', 'median','mode','constant']:
        raise ValueError("Strategy must be one of 'mean', 'median', 'mode' or 'constant'.")
    if strategy == 'mean':
        for i in columns:
            mean = 0
            x = df[i].dropna()
            x.index = [i+1 for i in range(len(x))]
            for j in range(1,len(x)+1):
                mean+=x[j]
            mean = mean /(len(df)-df[i].isna().sum())
            df[i] = np.where(df[i].isna(),mean,df[i])
            
    elif strategy == 'median':
        for i in columns:
            median = 0
            x = df[i].dropna()
            x = sorted(list(x))
            if len(x)%2:
                median = x[len(x)//2]
            else:
                median = (x[len(x)//2] + x[len(x)//2-1])/2
            df[i] = np.where(df[i].isna(),median,df[i])
            
    elif strategy == 'mode':
        for i in columns:
            mode = 0
            x = df[i].dropna()
            y = list(df[i].dropna())
            x = sorted(list(set(x)))
            x = [(m,y.count(m)) for m in x]
            max = x[0]
            for j in x:
                if j[1]>max[1]:
                    max = j
                    
            mode = max[0]                 
            df[i] = np.where(df[i].isna(),mode,df[i])
            
    elif strategy=="constant":
        for i in columns:
            df[i] = np.where(df[i].isna(),fill_value,df[i])

        
    df.reset_index(inplace=True,drop=True)
    return df

In [305]:
example_arr = np.array([[1,42,1,42,np.nan],[56,7,np.nan,3,39]]).reshape(5,-1)
df = pd.DataFrame(example_arr)
df.columns = ['col1','col2']
df


Unnamed: 0,col1,col2
0,1.0,42.0
1,1.0,42.0
2,,56.0
3,7.0,
4,3.0,39.0


In [306]:
SimpleImputer(df,'mean',columns=['col2'])


Unnamed: 0,col1,col2
0,1.0,42.0
1,1.0,42.0
2,,56.0
3,7.0,44.75
4,3.0,39.0


In [309]:
data = pd.read_excel('sample.xlsx')
data

Unnamed: 0,col 1,col 2,col3
0,1.0,a,d
1,2.0,,e
2,3.0,c,d
3,,b,
4,5.0,a,d


In [310]:
SimpleImputer(data,'median',['col 1'])

Unnamed: 0,col 1,col 2,col3
0,1.0,a,d
1,2.0,,e
2,3.0,c,d
3,2.5,b,
4,5.0,a,d


In [245]:
def OrdinalEncoder(df,columns = [], categories = []):
    for i in columns:
        if categories: 
            x = [i for i in range(len(categories))]
            new = dict(zip(categories,x))
            for m,n in enumerate(df[i]):
                if n in categories:
                    df[i][m] = new[n]
                    
        else:
            y = sorted(set(df[i].dropna()))
            new = dict(zip(y,range(len(y))))
            for m,n in enumerate(df[i]):
                if n in y:
                    df[i][m] = new[n]
            
    return df


   

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[i][m] = new[n]


Unnamed: 0,col 1,col 2,col3
0,1.0,0.0,d
1,2.0,,e
2,3.0,2.0,d
3,2.5,1.0,
4,5.0,0.0,d


In [311]:
df = pd.read_excel('sample.xlsx')
df

Unnamed: 0,col 1,col 2,col3
0,1.0,a,d
1,2.0,,e
2,3.0,c,d
3,,b,
4,5.0,a,d


In [312]:
OrdinalEncoder(data,['col3'])
data     

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[i][m] = new[n]


Unnamed: 0,col 1,col 2,col3
0,1.0,a,0.0
1,2.0,,1.0
2,3.0,c,0.0
3,2.5,b,
4,5.0,a,0.0


In [313]:
df = pd.read_excel('sample.xlsx')
df

Unnamed: 0,col 1,col 2,col3
0,1.0,a,d
1,2.0,,e
2,3.0,c,d
3,,b,
4,5.0,a,d


In [314]:
OrdinalEncoder(data,['col 2'],categories=['b','c','a'])
data     

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[i][m] = new[n]


Unnamed: 0,col 1,col 2,col3
0,1.0,2.0,0.0
1,2.0,,1.0
2,3.0,1.0,0.0
3,2.5,0.0,
4,5.0,2.0,0.0


In [317]:
df = pd.read_excel('sample.xlsx')
df

Unnamed: 0,col 1,col 2,col3
0,1.0,a,d
1,2.0,,e
2,3.0,c,d
3,,b,
4,5.0,a,d


In [318]:
def OneHotEncoder(df,columns=[]):
    for i in columns:
        x = sorted(set(df[i].dropna()))
        for j in x:
            df[j] = [1 if n==j else 0 for n in df[i]]
    return df

In [322]:
OneHotEncoder(df,columns=['col 2','col3']) 

Unnamed: 0,col 1,col 2,col3,a,b,c,d,e
0,1.0,a,d,1,0,0,1,0
1,2.0,,e,0,0,0,0,1
2,3.0,c,d,0,0,1,1,0
3,,b,,0,1,0,0,0
4,5.0,a,d,1,0,0,1,0
