In [1]:
# Write a function validate_data(data) that checks if a list of dictionaries 
# (e.g., [{"name": "Alice", "age": 30}, {"name": "Bob", "age": "25"}]) contains 
# valid integer values for the "age" key. Return a list of invalid entries.

In [2]:
def validate_data(data):
    """Return the entries of ``data`` that lack a valid integer "age".

    An entry is considered invalid when any of the following holds:

    * it is not a mapping (e.g. ``None`` or a string),
    * it has no ``"age"`` key,
    * its ``"age"`` value is not an ``int``,
    * its ``"age"`` value is a ``bool`` (``True``/``False``).

    Args:
        data: Iterable of records, normally dictionaries such as
            ``{"name": "Alice", "age": 30}``.

    Returns:
        list: The invalid entries, in their original order. The returned
        list holds the same objects that were passed in, not copies.
    """
    # Local import keeps this cell self-contained.
    from collections.abc import Mapping

    def _has_valid_age(entry):
        """Tell whether ``entry`` is a mapping carrying a genuine integer age."""
        if not isinstance(entry, Mapping) or 'age' not in entry:
            return False
        age = entry['age']
        # bool is a subclass of int, so True/False must be rejected explicitly.
        return isinstance(age, int) and not isinstance(age, bool)

    return [entry for entry in data if not _has_valid_age(entry)]


In [3]:
# Sample records: two valid ages, one string age, and one record without an age.
data = [
    dict(name="Alice", age=30),
    dict(name="Bob", age="25"),
    dict(name="Charlie"),
    dict(name="David", age=40),
]

# Show the records that validate_data reports as invalid.
print(validate_data(data))


[{'name': 'Bob', 'age': '25'}, {'name': 'Charlie'}]


In [2]:
# Create a decorator @log_execution_time that logs the time taken to execute a function.
# Use it to log the runtime of a sample function calculate_sum(n) that returns the sum of numbers from 1 to n.
import time
import functools

# Decorator to log execution time
def log_execution_time(func):
    """Decorator that prints how long each call to ``func`` takes.

    The elapsed time is measured with ``time.perf_counter()`` and printed
    after every call, including calls that raise: the message is emitted
    first and the exception then propagates unchanged.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper with the same signature and metadata as ``func`` that
        returns whatever ``func`` returns.
    """
    # Callables such as functools.partial objects have no __name__.
    func_name = getattr(func, '__name__', repr(func))

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so system clock adjustments cannot
        # produce negative or skewed durations as time.time() could.
        start_time = time.perf_counter()
        try:
            return func(*args, **kwargs)
        finally:
            # Runs on both normal return and exception.
            execution_time = time.perf_counter() - start_time
            print(f"Function '{func_name}' executed in {execution_time:.6f} seconds")
    return wrapper

# Sample function to use the decorator on
@log_execution_time
def calculate_sum(n):
    """Return the sum of the integers from 1 to ``n`` inclusive.

    For ``n`` below 1 the range is empty and the result is 0.
    """
    upper_bound = n + 1
    numbers = range(1, upper_bound)
    return sum(numbers)

# Demo: sum the first million integers; the decorator prints the runtime.
total = calculate_sum(1_000_000)
print("Sum:", total)



Function 'calculate_sum' executed in 0.039543 seconds
Sum: 500000500000


In [3]:
#  Missing Value Handling
# Task: A dataset has missing values in the "income" column. Write code to:

# 1. Replace missing values with the median if the data is normally distributed.

# 2. Replace with the mode if skewed.
# Use Pandas and a skewness threshold of 0.5.

import pandas as pd
import numpy as np


# Toy dataset with two missing incomes.
data = {
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank'],
    'income': [50000, 52000, np.nan, 48000, 51000, np.nan]
}

df = pd.DataFrame(data)

# Sample skewness of the observed (non-missing) incomes.
skewness = df['income'].skew(skipna=True)
print(f"Skewness of income: {skewness:.2f}")

# |skewness| at or below this value is treated as "approximately symmetric".
threshold = 0.5

# The filled column is assigned back instead of calling
# df['income'].fillna(..., inplace=True). The in-place form mutates an
# intermediate Series: it emits a FutureWarning on pandas 2.x and, with
# Copy-on-Write enabled, leaves `df` unchanged.
if abs(skewness) <= threshold:
    median_value = df['income'].median()
    df['income'] = df['income'].fillna(median_value)
    print(f"Filled missing values with median: {median_value}")
else:
    # mode() can return several values; the first (smallest) is used.
    mode_value = df['income'].mode()[0]
    df['income'] = df['income'].fillna(mode_value)
    print(f"Filled missing values with mode: {mode_value}")

# Final output
print("\nUpdated DataFrame:")
print(df)


Skewness of income: -0.75
Filled missing values with mode: 48000.0

Updated DataFrame:
      name   income
0    Alice  50000.0
1      Bob  52000.0
2  Charlie  48000.0
3    David  48000.0
4      Eva  51000.0
5    Frank  48000.0


In [5]:
 # Text Pre-processing
# Task: Clean a text column in a DataFrame by:

# 1. Converting to lowercase.

# 2. Removing special characters (e.g., !, @).

# 3. Tokenizing the text.
import pandas as pd

# Raw sample sentences containing mixed case and punctuation.
data = {
    'text': [
        "Hello World!",
        "Pandas is GREAT @ data-cleaning.",
        "Python > Java? Maybe...",
        "E-mail me at: abc@xyz.com!"
    ]
}

# Build the frame and derive both cleaned columns in one chain:
#   cleaned_text   - lowercased, then every character other than a-z, 0-9
#                    and space is deleted (not replaced by a space)
#   cleaned_tokens - cleaned_text split on whitespace
df = pd.DataFrame(data).assign(
    cleaned_text=lambda frame: (
        frame['text']
        .str.lower()
        .str.replace('[^a-z0-9 ]', '', regex=True)
    ),
    cleaned_tokens=lambda frame: frame['cleaned_text'].str.split(),
)

print(df[['text', 'cleaned_tokens']])


                               text                     cleaned_tokens
0                      Hello World!                     [hello, world]
1  Pandas is GREAT @ data-cleaning.  [pandas, is, great, datacleaning]
2           Python > Java? Maybe...              [python, java, maybe]
3        E-mail me at: abc@xyz.com!         [email, me, at, abcxyzcom]


In [6]:
# Hyperparameter Tuning
# Task: Use GridSearchCV to find the best max_depth (values: [3, 5, 7]) and
# n_estimators (values: [50, 100]) for a Random Forest classifier.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

# Load the iris features and labels as arrays.
X, y = load_iris(return_X_y=True)

# Hold out 30% of the samples for the final evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Base estimator whose hyperparameters are searched.
rf = RandomForestClassifier(random_state=42)

# Candidate values: 3 depths x 2 forest sizes = 6 combinations.
param_grid = dict(
    max_depth=[3, 5, 7],
    n_estimators=[50, 100],
)

# 5-fold cross-validated search scored by accuracy, fitted on the training split only.
grid_search = GridSearchCV(rf, param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validation score.
print("Best Parameters:", grid_search.best_params_)
print("Best CV Accuracy:", grid_search.best_score_)

# Score the selected model on the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
  

Best Parameters: {'max_depth': 5, 'n_estimators': 100}
Best CV Accuracy: 0.9428571428571428
Test Accuracy: 1.0


In [8]:
# Custom Evaluation Metric
# Task: Implement a custom metric weighted_accuracy where class 0 has a weight of 1 and class 1 has a weight of 2.

import numpy as np
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

# Custom weighted accuracy function
def weighted_accuracy(y_true, y_pred, class_weights=None):
    """Compute accuracy where each sample counts according to its true class.

    Every sample contributes the weight of its true label to the total; it
    also contributes that weight to the "correct" sum when the prediction
    matches. The score is ``correct_weight / total_weight``.

    Args:
        y_true: Iterable of true class labels.
        y_pred: Iterable of predicted class labels, paired element-wise
            with ``y_true``.
        class_weights: Optional mapping from class label to weight.
            Defaults to ``{0: 1, 1: 2}`` (class 0 weighs 1, class 1 weighs 2).
            Labels missing from the mapping get a weight of 1.

    Returns:
        float: The weighted accuracy, or ``0.0`` when the total weight is
        zero (for example when the inputs are empty).
    """
    if class_weights is None:
        class_weights = {0: 1, 1: 2}

    correct = 0
    total_weight = 0

    for yt, yp in zip(y_true, y_pred):
        weight = class_weights.get(yt, 1)  # Unlisted classes default to weight 1
        if yt == yp:
            correct += weight
        total_weight += weight

    # Guard against division by zero.
    return correct / total_weight if total_weight != 0 else 0.0

# Wrap the metric as a scikit-learn scorer (higher is better) so it can be
# passed as `scoring=` to GridSearchCV or other model-evaluation tools.
weighted_accuracy_scorer = make_scorer(
    weighted_accuracy,
    greater_is_better=True,
)
