#### Python ML Interview Code Snippets

#### Essential Dataframe Operations


In [1]:
import numpy as np
import pandas as pd

# create sample dataset

data = {'user_id':[1, 2, 3, 4, 5],
        'age':[25, 30, 35, np.nan, 28],
        'income': [50000, 75000, 60000, 100000, 90000],
        'category':['office assistant', 'assistant manager', 'staff', 'CEO', 'deputy manager']}

df = pd.DataFrame(data)

#### Handle Missing Values

In [2]:
def handle_missing_values(df, strategy='mean'):
    if strategy == 'mean':
        return df.fillna(df.mean(numeric_only=True))
    elif strategy == 'median':
        return df.fillna(df.median(numeric_only=True))
    elif strategy == 'mode':
        return df.fillna(df.mode(), iloc[0])
    else:
        return df.dropna()
    
print(handle_missing_values(df, strategy='mean'))


   user_id   age  income           category
0        1  25.0   50000   office assistant
1        2  30.0   75000  assistant manager
2        3  35.0   60000              staff
3        4  29.5  100000                CEO
4        5  28.0   90000     deputy manager


#### Advanced groupby operations

In [3]:
def advanced_aggregation(df):
    return df.groupby('category').agg({'age': ['mean', 'std', 'count'],
                                       'income':[sum, max, min]}).round(2)

print(advanced_aggregation(df))

                    age            income                
                   mean std count     sum     max     min
category                                                 
CEO                 NaN NaN     0  100000  100000  100000
assistant manager  30.0 NaN     1   75000   75000   75000
deputy manager     28.0 NaN     1   90000   90000   90000
office assistant   25.0 NaN     1   50000   50000   50000
staff              35.0 NaN     1   60000   60000   60000


  return df.groupby('category').agg({'age': ['mean', 'std', 'count'],
  return df.groupby('category').agg({'age': ['mean', 'std', 'count'],
  return df.groupby('category').agg({'age': ['mean', 'std', 'count'],


### Data Transformation Pipeline

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline

# create preprocessing pipeline

def create_preprocessing_pipeline():
    """
    production-ready preprocessing pipeline
    """
    # Numerical Features
    numerical_transformer = Pipeline(steps=[('scaler', StandardScaler)])

    # Categorical Features
    categorical_transformer = Pipeline(steps=[('encoder', LabelEncoder)])

    return numerical_transformer, categorical_transformer

In [None]:
# Feature scaling comparison

from sklearn.preprocessing import MinMaxScaler, RobustScaler

def compare_scaling_method(data):
    """"
    Demonstrate understanding of different scaling techniques
    """
    scalers = {
        'StandardScaler' : StandardScaler(),
        'MinMaxScaler' : MinMaxScaler(),
        'RobustScaler' : RobustScaler()
    }

    results = {}
    for name, scaler in scalers.items():
        scaled_data = scaler.fit_transform(data.reshape(-1,1))
        results[name] = {
            'mean': np.mean(scaled_data),
            'std' : np.std(scaled_data),
            'range': (np.min(scaled_data), np.max(scaled_data))
        }

    return results

In [6]:
sample_data = np.array([20, 22, 23, 24, 25, 26, 100])
results_stat = compare_scaling_method(sample_data)
print(results_stat)


{'StandardScaler': {'mean': 0.0, 'std': 1.0, 'range': (-0.5312689371697423, 2.443837110980815)}, 'MinMaxScaler': {'mean': 0.17857142857142858, 'std': 0.33612247221292807, 'range': (0.0, 1.0)}, 'RobustScaler': {'mean': 3.4285714285714284, 'std': 8.963265925678083, 'range': (-1.3333333333333333, 25.333333333333332)}}
