# Let's prepare for the coding interview with Intelmatix


In [7]:
import os
import torch
from tqdm import tqdm 
import sklearn as sk
import pandas as pd


# Creating, Saving and Reading data

In [None]:
# Build a named Series from a plain Python list of integers
s = pd.Series(list(range(10, 50, 10)), name="age")
print(s)

0    10
1    20
2    30
3    40
Name: age, dtype: int64


In [42]:
# Column-oriented input: every key becomes a column, every list its values
data = dict(
    name=["Alice", "Bob", "Charlie", "David", "Lukas"],
    age=[25, 30, 35, 40, 45],
    city=["NY", "LA", "Chicago", "Houston", "Houston"],
    salary=[70000, 80000, 90000, 100000, 122222],
)

df = pd.DataFrame.from_dict(data)
print(df)

      name  age     city  salary
0    Alice   25       NY   70000
1      Bob   30       LA   80000
2  Charlie   35  Chicago   90000
3    David   40  Houston  100000
4    Lukas   45  Houston  122222


In [11]:
# Round-trip the frame through a CSV file on disk
csv_path = "./example.csv"

# Write without the row index so it does not come back as an extra column
df.to_csv(csv_path, index=False)

# Load the file again and preview the first rows
df = pd.read_csv(csv_path)
print(df.head())

      name  age     city
0    Alice   25       NY
1      Bob   30       LA
2  Charlie   35  Chicago
3    David   40  Houston


# Indexing

In [17]:
# Two ways to address a cell:
#   df.loc[row_label, column_label]  -> label-based indexing
#   df.iloc[row_pos, column_pos]     -> integer-position-based indexing

# Label-based: the row labelled 0, the column named "age"
first_age_by_label = df.loc[0, "age"]
print(first_age_by_label)

# Position-based: first row, second column
first_age_by_position = df.iloc[0, 1]
print(first_age_by_position)


# BOOLEAN MASKING: keep only the rows where age > 30
older_than_30 = df["age"] > 30
print(df[older_than_30])

# CONDITIONAL FILTERING: combine masks with & -> age >= 30 AND city == "LA"
at_least_30_in_la = (df["age"] >= 30) & (df["city"] == "LA")
print(df[at_least_30_in_la])

25
25
      name  age     city
2  Charlie   35  Chicago
3    David   40  Houston
  name  age city
1  Bob   30   LA


# Data cleaning and transformation

df["age"].fillna(df["age"].mean(), inplace=True)

In [23]:
# Handling Missing Values

# df.isnull(): checks for NaN/Nulls.
# df.dropna(): drops rows (or columns) with NaN values.
# df.fillna(value): fills NaN with a value or a strategy.

# Cast "age" to float before inserting a missing value: an integer column
# cannot hold NaN, and relying on pandas to silently upcast it on assignment
# is deprecated in recent pandas releases.
df["age"] = df["age"].astype("float64")

# Blank out the age in the first row so there is something to clean up.
df.iloc[0, df.columns.get_loc("age")] = float("nan")
print(df)
print()
print(df.isnull())  # Check for NaN values
print()
print(df.isnull().sum())  # Count of NaN values in each column
print()
# print(df.dropna())  # Drop rows with NaN values
# print()

# Series.mean() skips NaN by default, so this is the mean of the known ages.
mean_age = df["age"].mean()
print(mean_age)

# Assign the filled column back instead of calling
# df["age"].fillna(..., inplace=True): that form operates on a column
# selection, triggers a chained-assignment warning in pandas 2.x and does not
# update `df` at all once Copy-on-Write is enabled.
df["age"] = df["age"].fillna(mean_age)
print(df)

      name   age     city
0    Alice   NaN       NY
1      Bob  30.0       LA
2  Charlie  35.0  Chicago
3    David  40.0  Houston

    name    age   city
0  False   True  False
1  False  False  False
2  False  False  False
3  False  False  False

name    0
age     1
city    0
dtype: int64

35.0
      name   age     city
0    Alice  35.0       NY
1      Bob  30.0       LA
2  Charlie  35.0  Chicago
3    David  40.0  Houston


In [30]:
# Changing Data Types
df = df.astype({"age": int})  # cast only the "age" column to integer
print(df)
print()

# String Operations (both new columns are derived from the frame as it is now)
df = df.assign(
    city=df["city"].str.lower(),             # lowercase all city strings
    name=df["name"].str.replace(" ", "_"),   # replace spaces with underscores
)
print(df)


      name  age     city
0    Alice   25       NY
1      Bob   30       LA
2  Charlie   35  Chicago
3    David   40  Houston

      name  age     city
0    Alice   25       ny
1      Bob   30       la
2  Charlie   35  chicago
3    David   40  houston


In [None]:
def _add_ten(age):
    """Return the given age increased by ten."""
    return age + 10


def _name_city_tag(row):
    """Join a row's "name" and "city" values with an underscore."""
    return "{}_{}".format(row["name"], row["city"])


# .apply() on a Series: the function is called once per value
df["age_plus_ten"] = df["age"].apply(_add_ten)
print(df)
print()

# .apply() on a DataFrame with axis=1: the function is called once per row
df["info"] = df.apply(_name_city_tag, axis=1)
print(df)
print()


      name  age     city  age_plus_ten
0    Alice   25       ny            35
1      Bob   30       la            40
2  Charlie   35  chicago            45
3    David   40  houston            50

      name  age     city  age_plus_ten             info
0    Alice   25       ny            35         Alice_ny
1      Bob   30       la            40           Bob_la
2  Charlie   35  chicago            45  Charlie_chicago
3    David   40  houston            50    David_houston



In [34]:
# Renaming Columns: map each old column label to its new label
column_renames = {"city": "location"}
df = df.rename(columns=column_renames)
print(df)

      name  age location  age_plus_ten             info
0    Alice   25       ny            35         Alice_ny
1      Bob   30       la            40           Bob_la
2  Charlie   35  chicago            45  Charlie_chicago
3    David   40  houston            50    David_houston


In [35]:
# Order the rows by age, largest first
df = df.sort_values("age", ascending=False)
print(df)

      name  age location  age_plus_ten             info
3    David   40  houston            50    David_houston
2  Charlie   35  chicago            45  Charlie_chicago
1      Bob   30       la            40           Bob_la
0    Alice   25       ny            35         Alice_ny


# Aggregation and grouping

In [43]:
# Start from a fresh frame. The cells above renamed "city" to "location",
# lower-cased its values, added helper columns and re-sorted the rows, so
# grouping the mutated `df` by "city" raises KeyError when the notebook is run
# top to bottom. Rebuilding the frame here keeps this cell (and the later cells
# that group by / cast "city") independent of execution order.
df = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie", "David", "Lukas"],
    "age": [25, 30, 35, 40, 45],
    "city": ["NY", "LA", "Chicago", "Houston", "Houston"],
    "salary": [70000, 80000, 90000, 100000, 122222]
})
print(df)
print()

# Basic aggregation: mean age within each city
grouped = df.groupby("city")["age"].mean()
print(grouped)
print()

# Multiple aggregations on multiple columns:
# mean and max of "age", sum of "salary", all per city
grouped2 = df.groupby("city").agg({
    "age": ["mean", "max"],
    "salary": "sum"
})

print(grouped2)
print()
print()

      name  age     city  salary
0    Alice   25       NY   70000
1      Bob   30       LA   80000
2  Charlie   35  Chicago   90000
3    David   40  Houston  100000
4    Lukas   45  Houston  122222

city
Chicago    35.0
Houston    42.5
LA         30.0
NY         25.0
Name: age, dtype: float64

          age      salary
         mean max     sum
city                     
Chicago  35.0  35   90000
Houston  42.5  45  222222
LA       30.0  30   80000
NY       25.0  25   70000



In [55]:
# First table: five people
data = {
    "name": ["Alice", "Bob", "Charlie", "David", "Lukas"],
    "age": [25, 30, 35, 40, 45],
    "city": ["NY", "LA", "Chicago", "Houston", "Houston"],
    "salary": [70000, 80000, 90000, 100000, 122222]
}

df1 = pd.DataFrame(data)

# Second table: the same columns restricted to the first four people
data = {column: values[:4] for column, values in data.items()}

df2 = pd.DataFrame(data)

# Stack the rows of both tables; each table keeps its own row labels
df_combined = pd.concat([df1, df2], axis="index")
print(df_combined)
print()
# Place the tables next to each other, aligned on the row labels
df_side_by_side = pd.concat([df1, df2], axis="columns")
print(df_side_by_side)
print()

# Inner join on "name": only names present in both tables survive
merged_df = df1.merge(df2, on="name", how="inner")
print(merged_df)
print()
print()

      name  age     city  salary
0    Alice   25       NY   70000
1      Bob   30       LA   80000
2  Charlie   35  Chicago   90000
3    David   40  Houston  100000
4    Lukas   45  Houston  122222
0    Alice   25       NY   70000
1      Bob   30       LA   80000
2  Charlie   35  Chicago   90000
3    David   40  Houston  100000

      name  age     city  salary     name   age     city    salary
0    Alice   25       NY   70000    Alice  25.0       NY   70000.0
1      Bob   30       LA   80000      Bob  30.0       LA   80000.0
2  Charlie   35  Chicago   90000  Charlie  35.0  Chicago   90000.0
3    David   40  Houston  100000    David  40.0  Houston  100000.0
4    Lukas   45  Houston  122222      NaN   NaN      NaN       NaN

      name  age_x   city_x  salary_x  age_y   city_y  salary_y
0    Alice     25       NY     70000     25       NY     70000
1      Bob     30       LA     80000     30       LA     80000
2  Charlie     35  Chicago     90000     35  Chicago     90000
3    David    

In [61]:
# Average the numeric columns within each (city, salary) pair
group_keys = ["city", "salary"]
grouped = df.groupby(group_keys).mean(numeric_only=True)

print(grouped)


                 age
city    salary      
Chicago 90000   35.0
Houston 100000  40.0
        122222  45.0
LA      80000   30.0
NY      70000   25.0


In [65]:
# Store "city" with the pandas categorical dtype
df = df.astype({"city": "category"})
print(df)

      name  age     city  salary
0    Alice   25       NY   70000
1      Bob   30       LA   80000
2  Charlie   35  Chicago   90000
3    David   40  Houston  100000
4    Lukas   45  Houston  122222


# SKLEARN

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


In [5]:
# Generate a Dataset
np.random.seed(42)  # fix NumPy's global RNG so the synthetic data is repeatable

n_samples = 100

data = {
    # Created as float64 on purpose: NaN is written into this column below and
    # an integer column cannot hold it. Implicit int -> float upcasting on
    # assignment is deprecated in pandas >= 2.1. The cast does not consume any
    # random draws, so the generated values are unchanged.
    "age": np.random.randint(20, 60, size=n_samples).astype("float64"),
    "income": np.random.normal(50000, 15000, size=n_samples),
    "city": np.random.choice(["New York", "San Francisco", "Chicago", "Houston"], size=n_samples),
    "gender": np.random.choice(["Male", "Female", "Other"], size=n_samples),
    "education_level": np.random.choice(["High School", "Bachelors", "Masters", "PhD"], size=n_samples),
    "purchased": np.random.choice([0, 1], size=n_samples)
}

df = pd.DataFrame(data)

# Introduce missing values randomly: blank out 10% of the rows in each of
# these columns (a separate random subset is drawn per column)
for col in ["age", "income", "city"]:
    missing_rows = df.sample(frac=0.1).index
    df.loc[missing_rows, col] = np.nan

print(df.head())


    age        income           city  gender education_level  purchased
0  58.0  47334.018181        Chicago    Male     High School          1
1  48.0  43844.250370            NaN   Other         Masters          0
2  34.0  67695.745171        Houston    Male       Bachelors          0
3  27.0  36526.880908        Houston   Other             PhD          1
4  40.0  62521.931288  San Francisco  Female       Bachelors          0


In [7]:
# Define Features and Target
target_column = "purchased"

X = df.drop(columns=target_column)  # every column except the label
y = df[target_column]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

In [13]:
# Reference only (not executed): the same preprocessing and model expressed as
# a single scikit-learn Pipeline.
#
# numeric_features = ["age", "income"]
# categorical_features = ["city", "gender", "education_level"]

# numeric_transformer = Pipeline([
#     ("imputer", SimpleImputer(strategy="mean")),
#     ("scaler", StandardScaler())
# ])

# categorical_transformer = Pipeline([
#     ("imputer", SimpleImputer(strategy="most_frequent")),
#     ("onehot", OneHotEncoder(handle_unknown="ignore"))
# ])

# preprocessor = ColumnTransformer([
#     ("num", numeric_transformer, numeric_features),
#     ("cat", categorical_transformer, categorical_features)
# ])

# pipeline = Pipeline([
#     ("preprocessor", preprocessor),
#     ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))
# ])


#  Alternatively, apply every step by hand. Each transformer is fitted on the
#  training split only and then re-used, unchanged, on the test split.
#  Handle numerical features:

numeric_features = ["age", "income"]

# Impute missing values with the column mean
num_imputer = SimpleImputer(strategy="mean")
X_train_num = num_imputer.fit_transform(X_train[numeric_features])
X_test_num = num_imputer.transform(X_test[numeric_features])

# Scale
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train_num)
X_test_num = scaler.transform(X_test_num)


# Handle categorical features:
categorical_features = ["city", "gender", "education_level"]

# Impute missing values with the most frequent category
cat_imputer = SimpleImputer(strategy="most_frequent")
X_train_cat = cat_imputer.fit_transform(X_train[categorical_features])
X_test_cat = cat_imputer.transform(X_test[categorical_features])

# One-hot encode.
# The `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2
# and removed in 1.4, where passing it raises TypeError. Keeping the default
# (sparse) output and densifying explicitly works on every version.
encoder = OneHotEncoder(handle_unknown="ignore")
X_train_cat = encoder.fit_transform(X_train_cat).toarray()
X_test_cat = encoder.transform(X_test_cat).toarray()


# Final design matrices: scaled numeric columns followed by the one-hot columns
X_train_final = np.hstack([X_train_num, X_train_cat])
X_test_final = np.hstack([X_test_num, X_test_cat])




In [14]:
# Train and Evaluate Model
# fit() returns the fitted estimator, so construction and training can be chained
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_final, y_train
)

# Predict labels for the held-out rows
y_pred = model.predict(X_test_final)

# Overall accuracy first, then the per-class precision / recall / F1 table
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))

Accuracy: 0.45
              precision    recall  f1-score   support

           0       0.75      0.40      0.52        15
           1       0.25      0.60      0.35         5

    accuracy                           0.45        20
   macro avg       0.50      0.50      0.44        20
weighted avg       0.62      0.45      0.48        20

