In [1]:
import pandas as pd
pd.__version__

'3.0.0'

In [4]:
def image_info(dimensions):
    height, width, channels = dimensions
    total_pixels = height * width
    is_grayscale= True if channels == 1 else False
    return {
        "height" : height,
        "width" : width,
        "channels" : channels,
        "total_pixels" : total_pixels,
        "is_grayscale" : is_grayscale
        }
    pass

# Test your solution
print(image_info((224, 224, 3)))
print(image_info((128, 128, 1)))

{'height': 224, 'width': 224, 'channels': 3, 'total_pixels': 50176, 'is_grayscale': False}
{'height': 128, 'width': 128, 'channels': 1, 'total_pixels': 16384, 'is_grayscale': True}


In [5]:
import math

def distance(point1, point2):
    x1, y1= point1
    x2, y2= point2
    dist = math.sqrt((x2-x1)**2 + (y2-y1)**2)
    return round(dist,2)
    pass

# Test your solution
print(distance((0, 0), (3, 4)))
print(distance((1, 2), (4, 6)))


5.0
5.0


In [6]:
def validate_config(config):
    errors = []
    learning_rate, batch_size, epochs, optimizer = config
    if not (0.0001 <= learning_rate <=1.0):
        errors.append("learning rate out of range")
    if batch_size <= 0 or (batch_size & (batch_size -1))!=0:
        errors.append("batch size not power of 2")
    if not isinstance(epochs, int) or epochs <=0:
        errors.append("epochs must be positive")
    if optimizer not in [ "adam" , "sgd" , "rmsprop"]:
        errors.append("invalid optimizer")
        
    is_valid = len(errors)==0
    return (is_valid, errors)
    pass

# Test your solution
print(validate_config((0.001, 32, 100, 'adam')))
print(validate_config((2.0, 30, -5, 'bad_optimizer')))

(True, [])
(False, ['learning rate out of range', 'batch size not power of 2', 'epochs must be positive', 'invalid optimizer'])


In [7]:
def split_data(total_samples, train_ratio, val_ratio, test_ratio):
    train_end= int(total_samples *train_ratio)
    val_end= train_end + int(total_samples* val_ratio)
    test_end = total_samples
    
    train_indices =(0, train_end)
    val_indices= (train_end, val_end)
    test_indices =(val_end, train_end)
    return(train_indices, val_indices, test_indices)
    pass

# Test your solution
print(split_data(100, 0.7, 0.2, 0.1))
print(split_data(1000, 0.8, 0.1, 0.1))

((0, 70), (70, 90), (90, 70))
((0, 800), (800, 900), (900, 800))


In [9]:
def compare_models(models):
    # Dictionaries to store sums and best values
    sums = {
        'accuracy': 0,
        'precision': 0,
        'recall': 0,
        'f1_score': 0
    }
    
    best_models = {
        'accuracy': None,
        'precision': None,
        'recall': None,
        'f1_score': None
    }
    
    best_scores = {
        'accuracy': -1,
        'precision': -1,
        'recall': -1,
        'f1_score': -1
    }
    
    # Loop through models
    for model in models:
        name, acc, prec, rec, f1 = model
        
        sums['accuracy'] += acc
        sums['precision'] += prec
        sums['recall'] += rec
        sums['f1_score'] += f1
        
        if acc > best_scores['accuracy']:
            best_scores['accuracy'] = acc
            best_models['accuracy'] = name
        
        if prec > best_scores['precision']:
            best_scores['precision'] = prec
            best_models['precision'] = name
        
        if rec > best_scores['recall']:
            best_scores['recall'] = rec
            best_models['recall'] = name
        
        if f1 > best_scores['f1_score']:
            best_scores['f1_score'] = f1
            best_models['f1_score'] = name
    
    # Calculate averages
    total_models = len(models)
    averages = {
        metric: round(sums[metric] / total_models, 2)
        for metric in sums
    }
    
    # Rank models by f1 score (descending)
    ranked_by_f1 = sorted(
        models,
        key=lambda x: x[4],
        reverse=True
    )
    
    ranked_by_f1 = [model[0] for model in ranked_by_f1]
    
    return {
        'best_models': best_models,
        'averages': averages,
        'ranked_by_f1': ranked_by_f1
    }
    
    models = [
    ('Model_A', 0.85, 0.82, 0.88, 0.85),
    ('Model_B', 0.92, 0.90, 0.85, 0.87),
    ('Model_C', 0.88, 0.85, 0.92, 0.88)
    ]
    print(compare_models(models))

In [10]:
def find_duplicates(data):
    seen = set()
    duplicates = set()
    for item in data :
        if item in seen:
            duplicates.add(item)
        else:
            seen.add(item)
    return {
        'has_duplicates': len(duplicates) > 0,
        'unique_count': len(seen),
        'duplicate_count': len(duplicates),
        'duplicates': duplicates
    }
    pass

# Test your solution
print(find_duplicates([1, 2, 3, 2, 4, 3, 5]))
print(find_duplicates([1, 2, 3, 4, 5]))

{'has_duplicates': True, 'unique_count': 5, 'duplicate_count': 2, 'duplicates': {2, 3}}
{'has_duplicates': False, 'unique_count': 5, 'duplicate_count': 0, 'duplicates': set()}


In [11]:
def encode_categories(data):
    unique_categories = set(data)
    encoding_map={}
    for index, category in enumerate(unique_categories):
        encoding_map[category] = index
    encoded_data= [encoding_map[item] for item in data]
    return {
        'encoding_map': encoding_map,
        'enconded_data' : encoded_data,
        'num_categories' : len(unique_categories)
    }

    pass

# Test your solution
print(encode_categories(['cat', 'dog', 'cat', 'bird', 'dog', 'cat']))

{'encoding_map': {'cat': 0, 'dog': 1, 'bird': 2}, 'enconded_data': [0, 1, 0, 2, 1, 0], 'num_categories': 3}


In [12]:
def analyze_overlap(train_ids, test_ids):
    train_set = set(train_ids)
    test_set = set(test_ids)
    
    overlap = train_set & test_set
    only_train = train_set - test_set
    only_test = test_set - train_set
    total_unique = len(train_set | test_set)
    
    return {
        'total_unique': total_unique,
        'overlap': overlap,
        'only_train': only_train,
        'only_test': only_test,
        'has_leakage': len(overlap) > 0
    }
    pass

# Test your solution
print(analyze_overlap([1, 2, 3, 4, 5], [4, 5, 6, 7, 8]))

{'total_unique': 8, 'overlap': {4, 5}, 'only_train': {1, 2, 3}, 'only_test': {8, 6, 7}, 'has_leakage': True}


In [13]:
def remove_stopwords(text, stopwords):
    # Step 1: Split text into words
    original_words = text.split()
    
    # Step 2: Filter out stopwords
    filtered_words = []
    for word in original_words:
        if word not in stopwords:
            filtered_words.append(word)
    
    # Step 3: Count removed words
    removed_count = len(original_words) - len(filtered_words)
    
    # Step 4: Get unique filtered words
    unique_filtered = set(filtered_words)
    
    return {
        'original_words': original_words,
        'filtered_words': filtered_words,
        'removed_count': removed_count,
        'unique_filtered': unique_filtered
    }
    pass

# Test your solution
stopwords = {'the', 'is', 'a', 'an', 'in', 'on', 'at'}
text = "the cat is on the mat in the house"
print(remove_stopwords(text, stopwords))

{'original_words': ['the', 'cat', 'is', 'on', 'the', 'mat', 'in', 'the', 'house'], 'filtered_words': ['cat', 'mat', 'house'], 'removed_count': 6, 'unique_filtered': {'house', 'cat', 'mat'}}


In [15]:
def filter_correlated(correlations, threshold):
    highly_correlated_pairs = set()
    
    # Step 1: Find highly correlated pairs
    for pair, corr in correlations.items():
        if corr >= threshold:
            highly_correlated_pairs.add(pair)
    
    # Step 2: Decide which features to remove
    features_to_remove = set()
    for f1, f2 in highly_correlated_pairs:
        features_to_remove.add(f2)
    
    # Step 3: Collect all features
    all_features = set()
    for f1, f2 in correlations.keys():
        all_features.add(f1)
        all_features.add(f2)
    
    # Step 4: Features to keep
    features_to_keep = all_features - features_to_remove
    
    return {
        'highly_correlated_pairs': highly_correlated_pairs,
        'features_to_remove': features_to_remove,
        'features_to_keep': features_to_keep
    }
    pass

# Test your solution
correlations = {
    ('f1', 'f2'): 0.95,
    ('f1', 'f3'): 0.5,
    ('f2', 'f4'): 0.92,
    ('f3', 'f4'): 0.3
}
print(filter_correlated(correlations, threshold=0.9))

{'highly_correlated_pairs': {('f2', 'f4'), ('f1', 'f2')}, 'features_to_remove': {'f2', 'f4'}, 'features_to_keep': {'f1', 'f3'}}


In [16]:
from itertools import product

def generate_grid(params):
    keys = list(params.keys())
    values = list(params.values())
    
    combinations = product(*values)
    
    result = []
    for combo in combinations:
        combo_dict = dict(zip(keys, combo))
        result.append(combo_dict)
    
    return result
    pass

# Test your solution
params = {
    'learning_rate': [0.01, 0.1],
    'batch_size': [32, 64],
    'epochs': [10, 20]
}
result = generate_grid(params)
print(f"Total combinations: {len(result)}")
print("First 3:", result[:3])

Total combinations: 8
First 3: [{'learning_rate': 0.01, 'batch_size': 32, 'epochs': 10}, {'learning_rate': 0.01, 'batch_size': 32, 'epochs': 20}, {'learning_rate': 0.01, 'batch_size': 64, 'epochs': 10}]


In [20]:
import statistics

def calculate_stats(data):
     result = {}
    
     for feature, values in data.items():
        sorted_vals = sorted(values)
        n = len(sorted_vals)
        
        mean = statistics.mean(sorted_vals)
        median = statistics.median(sorted_vals)
        std = statistics.stdev(sorted_vals)
        min_val = min(sorted_vals)
        max_val = max(sorted_vals)
        
        # Quartiles
        q1 = statistics.median(sorted_vals[:n//2])
        if n % 2 == 0:
            q3 = statistics.median(sorted_vals[n//2:])
        else:
            q3 = statistics.median(sorted_vals[n//2+1:])
        
        result[feature] = {
            'mean': mean,
            'median': median,
            'std': round(std, 2),
            'min': min_val,
            'max': max_val,
            'q1': q1,
            'q3': q3
        }
    
     return result
     pass

# Test your solution
data = {
    'age': [25, 30, 35, 40, 45],
    'income': [50000, 60000, 70000, 80000, 90000]
}
print(calculate_stats(data))

{'age': {'mean': 35, 'median': 35, 'std': 7.91, 'min': 25, 'max': 45, 'q1': 27.5, 'q3': 42.5}, 'income': {'mean': 70000, 'median': 70000, 'std': 15811.39, 'min': 50000, 'max': 90000, 'q1': 55000.0, 'q3': 85000.0}}


In [21]:
def aggregate_data(data):
    result = {}
    
    for item in data:
        category = item['category']
        value = item['value']
        
        if category not in result:
            result[category] = {
                'count': 0,
                'sum': 0,
                'values_list': []
            }
        
        result[category]['count'] += 1
        result[category]['sum'] += value
        result[category]['values_list'].append(value)
    
    # Calculate average
    for category in result:
        result[category]['avg'] = round(
            result[category]['sum'] / result[category]['count'], 2
        )
    
    return result
    pass

# Test your solution
data = [
    {'category': 'A', 'value': 10},
    {'category': 'B', 'value': 20},
    {'category': 'A', 'value': 15},
    {'category': 'B', 'value': 25},
    {'category': 'A', 'value': 12}
]
print(aggregate_data(data))

{'A': {'count': 3, 'sum': 37, 'values_list': [10, 15, 12], 'avg': 12.33}, 'B': {'count': 2, 'sum': 45, 'values_list': [20, 25], 'avg': 22.5}}


In [23]:
def build_confusion_matrix(actual, predicted):
      matrix = {}
      classes = set(actual) | set(predicted)
    
    # Initialize matrix
      for a in classes:
        matrix[a] = {}
        for p in classes:
            matrix[a][p] = 0
    
    # Fill matrix
      for a, p in zip(actual, predicted):
        matrix[a][p] += 1
    
    # Accuracy
      correct = sum(matrix[c][c] for c in classes)
      total = len(actual)
      accuracy = correct / total
    
    # Per-class metrics
      per_class = {}
    
      for c in classes:
        tp = matrix[c][c]
        fp = sum(matrix[other][c] for other in classes if other != c)
        fn = sum(matrix[c][other] for other in classes if other != c)
        
        precision = tp / (tp + fp) if (tp + fp) != 0 else 0
        recall = tp / (tp + fn) if (tp + fn) != 0 else 0
        f1 = (2 * precision * recall / (precision + recall)
              if (precision + recall) != 0 else 0)
        
        per_class[c] = {
            'precision': round(precision, 2),
            'recall': round(recall, 2),
            'f1': round(f1, 2)
        }
    
      return {
        'matrix': matrix,
        'accuracy': round(accuracy, 3),
        'per_class': per_class
    }
      pass

# Test your solution
actual = ['A', 'B', 'A', 'C', 'B', 'A', 'C', 'C']
predicted = ['A', 'B', 'B', 'C', 'A', 'A', 'C', 'B']
print(build_confusion_matrix(actual, predicted))

{'matrix': {'C': {'C': 2, 'A': 0, 'B': 1}, 'A': {'C': 0, 'A': 2, 'B': 1}, 'B': {'C': 0, 'A': 1, 'B': 1}}, 'accuracy': 0.625, 'per_class': {'C': {'precision': 1.0, 'recall': 0.67, 'f1': 0.8}, 'A': {'precision': 0.67, 'recall': 0.67, 'f1': 0.67}, 'B': {'precision': 0.33, 'recall': 0.5, 'f1': 0.4}}}


In [24]:
import time

def timer(func):
      def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        end = time.time()
        
        execution_time = (end - start) * 1000  # milliseconds
        print(f"{func.__name__} executed in {execution_time:.2f} ms")
        
        return result
      return wrapper
      pass

# Test your decorator
@timer
def slow_function(n):
    time.sleep(n)
    return n * 2

result = slow_function(0.5)
print(f"Result: {result}")

slow_function executed in 505.07 ms
Result: 1.0


In [26]:
def memoize(func):
     cache = {}
     hits = 0
     misses = 0
    
     def wrapper(*args):
        nonlocal hits, misses
        
        if args in cache:
            hits += 1
            return cache[args]
        else:
            misses += 1
            result = func(*args)
            cache[args] = result
            return result
    
     def cache_info():
        return {
            'hits': hits,
            'misses': misses,
            'size': len(cache)
        }
    
     wrapper.cache_info = cache_info
     return wrapper
     pass

# Test your decorator
@memoize
def fibonacci(n):
    if n <= 1:
        return n
    return fibonacci(n-1) + fibonacci(n-2)

print("First call:", fibonacci(10))
print("Second call:", fibonacci(10))
print("Cache info:", fibonacci.cache_info())

First call: 55
Second call: 55
Cache info: {'hits': 9, 'misses': 11, 'size': 11}


In [32]:
def create_metric(metric_name):
        def accuracy(y_true, y_pred):
            correct = 0
        for t, p in zip(y_true, y_pred):
            if t == p:
                correct += 1
        return correct / len(y_true)
     
        def precision(y_true, y_pred):
            tp = 0
            fp = 0
        for t, p in zip(y_true, y_pred):
            if p == 1 and t == 1:
                tp += 1
            elif p == 1 and t == 0:
                fp += 1
        return tp / (tp + fp) if (tp + fp) != 0 else 0
    
        def recall(y_true, y_pred):
            tp = 0
            fn = 0
        for t, p in zip(y_true, y_pred):
            if t == 1 and p == 1:
                tp += 1
            elif t == 1 and p == 0:
                fn += 1
        return tp / (tp + fn) if (tp + fn) != 0 else 0
    
        def f1(y_true, y_pred):
            p = precision(y_true, y_pred)
            r = recall(y_true, y_pred)
        return 2 * p * r / (p + r) if (p + r) != 0 else 0
    
        if metric_name == 'accuracy':
            return accuracy
        elif metric_name == 'precision':
            return precision
        elif metric_name == 'recall':
            return recall
        elif metric_name == 'f1':
            return f1
        else:
            raise ValueError("Unsupported metric")
        pass

# Test your solution
accuracy_fn = create_metric('accuracy')
precision_fn = create_metric('precision')

y_true = [1, 0, 1, 1, 0]
y_pred = [1, 0, 1, 0, 0]

print("Accuracy:", accuracy_fn(y_true, y_pred))
print("Precision:", precision_fn(y_true, y_pred))

NameError: name 'y_true' is not defined

In [33]:
#21
def transform_data(data):
    ints = list(map(lambda x: int(x), data))
    evens = list(filter(lambda x: x % 2 == 0, ints))
    squares = list(map(lambda x: x ** 2, evens))
    return sum(squares)
    pass

# Test your solution
result = transform_data(['1', '2', '3', '4', '5', '6'])
print(f"Result: {result}")

Result: 56


In [34]:
import statistics

def scale_features(data, method):
    if method == 'minmax':
        min_val = min(data)
        max_val = max(data)
        return [(x - min_val) / (max_val - min_val) for x in data]
    
    elif method == 'zscore':
        mean = statistics.mean(data)
        std = statistics.stdev(data)
        return [(x - mean) / std for x in data]
    
    elif method == 'robust':
        median = statistics.median(data)
        q1 = statistics.median(sorted(data)[:len(data)//2])
        q3 = statistics.median(sorted(data)[len(data)//2 + 1:])
        iqr = q3 - q1
        return [(x - median) / iqr for x in data]
    
    else:
        raise ValueError("Invalid scaling method")
    pass

# Test your solution
data = [1, 2, 3, 4, 5]
print("Min-Max:", scale_features(data, 'minmax'))
print("Z-score:", scale_features(data, 'zscore'))
print("Robust:", scale_features(data, 'robust'))

Min-Max: [0.0, 0.25, 0.5, 0.75, 1.0]
Z-score: [-1.2649110640673518, -0.6324555320336759, 0.0, 0.6324555320336759, 1.2649110640673518]
Robust: [-0.6666666666666666, -0.3333333333333333, 0.0, 0.3333333333333333, 0.6666666666666666]


In [36]:
from functools import reduce

def product(numbers):
      return reduce(lambda x, y: x * y, numbers)
      pass

def maximum(numbers):
    return reduce(lambda x, y: x if x > y else y, numbers)
    pass

def join_strings(strings, separator):
    return reduce(lambda x, y: x + separator + y, strings)
    pass

def flatten(nested_list):
    return reduce(lambda x, y: x + y, nested_list)
    pass

# Test your solutions
print("Product:", product([1, 2, 3, 4, 5]))
print("Maximum:", maximum([3, 1, 4, 1, 5, 9, 2, 6]))
print("Join:", join_strings(['Hello', 'World', 'ML'], ' '))
print("Flatten:", flatten([[1, 2], [3, 4], [5, 6]]))

Product: 120
Maximum: 9
Join: Hello World ML
Flatten: [1, 2, 3, 4, 5, 6]


In [37]:
def transpose(matrix):
    return [[row[i] for row in matrix] for i in range(len(matrix[0]))]
    pass

def flatten_even(matrix):
    return [x for row in matrix for x in row if x % 2 == 0]
    pass

def cartesian(list1, list2, threshold=5):
    return [(x, y) for x in list1 for y in list2 if x + y > threshold]
    pass

def invert_dict(d):
    return {v: [k for k in d if d[k] == v] for v in set(d.values())}
    pass

# Test your solutions
matrix = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
print("Transpose:", transpose(matrix))
print("Flatten even:", flatten_even(matrix))
print("Cartesian:", cartesian([1, 2, 3], [4, 5, 6]))
print("Invert:", invert_dict({'a': 1, 'b': 2, 'c': 1}))

Transpose: [[1, 4, 7], [2, 5, 8], [3, 6, 9]]
Flatten even: [2, 4, 6, 8]
Cartesian: [(1, 5), (1, 6), (2, 4), (2, 5), (2, 6), (3, 4), (3, 5), (3, 6)]
Invert: {1: ['a', 'c'], 2: ['b']}


In [39]:
import pandas as pd
import numpy as np

def series_stats(data):
     s = pd.Series(data)
    
     return {
        'mean': s.mean(),
        'median': s.median(),
        'std': s.std(),
        'var': s.var(),
        'min': s.min(),
        'max': s.max(),
        'q1': s.quantile(0.25),
        'q3': s.quantile(0.75),
        'skew': s.skew()
    }
     pass

# Test your solution
result = series_stats([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
print(result)

{'mean': np.float64(5.5), 'median': np.float64(5.5), 'std': np.float64(3.0276503540974917), 'var': np.float64(9.166666666666666), 'min': np.int64(1), 'max': np.int64(10), 'q1': np.float64(3.25), 'q3': np.float64(7.75), 'skew': np.float64(0.0)}


In [40]:
import pandas as pd

def create_dataframe(data, columns=None):
    if isinstance(data, dict):
        return pd.DataFrame(data)
    
    # List of dictionaries
    elif isinstance(data, list) and isinstance(data[0], dict):
        return pd.DataFrame(data)
    
    # List of tuples (needs columns)
    elif isinstance(data, list) and isinstance(data[0], tuple):
        return pd.DataFrame(data, columns=columns)
    
    else:
        raise ValueError("Unsupported data format")
    pass

# Test your solution
print("Method 1: Dict of lists")
print(create_dataframe({'A': [1, 2, 3], 'B': [4, 5, 6]}))

print("\nMethod 2: List of dicts")
print(create_dataframe([{'A': 1, 'B': 4}, {'A': 2, 'B': 5}, {'A': 3, 'B': 6}]))

Method 1: Dict of lists
   A  B
0  1  4
1  2  5
2  3  6

Method 2: List of dicts
   A  B
0  1  4
1  2  5
2  3  6


In [41]:
import pandas as pd

def create_with_index(data, index_type, custom_index=None):
    df = pd.DataFrame(data)
    n = len(df)
    
    if index_type == 'numeric':
        index = list(range(100, 100 + 5*n, 5))
    
    elif index_type == 'date':
        index = pd.date_range(start='2024-01-01', periods=n, freq='D')
    
    elif index_type == 'custom':
        if custom_index is None:
            raise ValueError("custom_index must be provided for custom type")
        index = custom_index
    
    else:
        raise ValueError("Invalid index type")
    
    df.index = index
    return df
    pass

# Test your solution
data = {'Score': [85, 90, 78, 92, 88]}

print("Numeric index:")
print(create_with_index(data, 'numeric'))

print("\nDate index:")
print(create_with_index(data, 'date'))

print("\nCustom index:")
print(create_with_index(data, 'custom', custom_index=['A', 'B', 'C', 'D', 'E']))

Numeric index:
     Score
100     85
105     90
110     78
115     92
120     88

Date index:
            Score
2024-01-01     85
2024-01-02     90
2024-01-03     78
2024-01-04     92
2024-01-05     88

Custom index:
   Score
A     85
B     90
C     78
D     92
E     88


In [42]:
import pandas as pd

def extract_info(df):
    info = {}
    
    info['shape'] = df.shape
    info['columns'] = list(df.columns)
    info['dtypes'] = df.dtypes.astype(str).to_dict()
    info['missing_values'] = df.isnull().sum().to_dict()
    info['unique_counts'] = df.nunique().to_dict()
    info['memory_mb'] = df.memory_usage(deep=True).sum() / (1024 ** 2)
    
    # Numeric summary
    numeric_cols = df.select_dtypes(include='number')
    info['numeric_summary'] = numeric_cols.describe().to_dict()
    
    return info
    pass

# Test your solution
df = pd.DataFrame({
    'Age': [25, 30, None, 35, 40],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Salary': [50000, 60000, 70000, None, 90000]
})

info = extract_info(df)
print("DataFrame Info:")
for key, value in info.items():
    print(f"{key}: {value}")

DataFrame Info:
shape: (5, 3)
columns: ['Age', 'Name', 'Salary']
dtypes: {'Age': 'float64', 'Name': 'str', 'Salary': 'float64'}
missing_values: {'Age': 1, 'Name': 0, 'Salary': 1}
unique_counts: {'Age': 4, 'Name': 5, 'Salary': 4}
memory_mb: 0.000457763671875
numeric_summary: {'Age': {'count': 4.0, 'mean': 32.5, 'std': 6.454972243679028, 'min': 25.0, '25%': 28.75, '50%': 32.5, '75%': 36.25, 'max': 40.0}, 'Salary': {'count': 4.0, 'mean': 67500.0, 'std': 17078.25127659933, 'min': 50000.0, '25%': 57500.0, '50%': 65000.0, '75%': 75000.0, 'max': 90000.0}}


In [43]:
#31
import pandas as pd

def select_columns(df, method='all', pattern=None, dtype=None):
    if method == 'all':
        return df.copy()
    
    elif method == 'numeric':
        return df.select_dtypes(include='number')
    
    elif method == 'dtype':
        return df.select_dtypes(include=dtype)
    
    elif method == 'contains':
        return df[[col for col in df.columns if pattern in col]]
    
    elif method == 'startswith':
        return df[[col for col in df.columns if col.startswith(pattern)]]
    
    elif method == 'endswith':
        return df[[col for col in df.columns if col.endswith(pattern)]]
    
    elif method == 'mean_gt':
        # select numeric columns whose mean > threshold
        numeric_df = df.select_dtypes(include='number')
        cols = numeric_df.columns[numeric_df.mean() > threshold]
        return df[cols]
    
    else:
        raise ValueError("Invalid method")
    pass

# Test your solution
df = pd.DataFrame({
    'age': [25, 30, 35],
    'name': ['Alice', 'Bob', 'Charlie'],
    'salary': [50000, 60000, 70000],
    'bonus': [5000, 6000, 7000]
})

print("Numeric columns:")
print(select_columns(df, method='numeric'))

print("\nColumns containing 'sal':")
print(select_columns(df, method='contains', pattern='sal'))

Numeric columns:
   age  salary  bonus
0   25   50000   5000
1   30   60000   6000
2   35   70000   7000

Columns containing 'sal':
   salary
0   50000
1   60000
2   70000


In [45]:
#32
import pandas as pd

def filter_rows(df, conditions, logic='AND'):
    masks = []

    for col, (op, val) in conditions.items():
        if op == '>':
            mask = df[col] > val
        elif op == '<':
            mask = df[col] < val
        elif op == '>=':
            mask = df[col] >= val
        elif op == '<=':
            mask = df[col] <= val
        elif op == '==':
            mask = df[col] == val
        elif op == '!=':
            mask = df[col] != val
        elif op == 'between':
            mask = df[col].between(val[0], val[1])
        elif op == 'in':
            mask = df[col].isin(val)
        elif op == 'contains':
            mask = df[col].str.contains(val)
        else:
            raise ValueError("Invalid operator")

        masks.append(mask)

    if logic == 'AND':
        final_mask = masks[0]
        for m in masks[1:]:
            final_mask = final_mask & m
    else:  # OR
        final_mask = masks[0]
        for m in masks[1:]:
            final_mask = final_mask | m

    return df[final_mask]
    pass

# Test your solution
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie', 'David'],
    'age': [25, 30, 35, 40],
    'city': ['NYC', 'LA', 'NYC', 'Chicago'],
    'salary': [50000, 60000, 70000, 80000]
})

print("Age > 30:")
print(filter_rows(df, {'age': ('>', 30)}, logic='AND'))

print("\nAge > 30 AND city == NYC:")
print(filter_rows(df, {'age': ('>', 30), 'city': ('==', 'NYC')}, logic='AND'))

Age > 30:
      name  age     city  salary
2  Charlie   35      NYC   70000
3    David   40  Chicago   80000

Age > 30 AND city == NYC:
      name  age city  salary
2  Charlie   35  NYC   70000


In [46]:
#33
import pandas as pd

def select_data(df, method='loc', rows=None, cols=None):
    if method == 'loc':
        if rows is None and cols is None:
            return df.loc[:]
        elif rows is None:
            return df.loc[:, cols]
        elif cols is None:
            return df.loc[rows, :]
        else:
            return df.loc[rows, cols]
    
    elif method == 'iloc':
        if rows is None and cols is None:
            return df.iloc[:]
        elif rows is None:
            return df.iloc[:, cols]
        elif cols is None:
            return df.iloc[rows, :]
        else:
            return df.iloc[rows, cols]
    
    else:
        raise ValueError("method must be 'loc' or 'iloc'")
    pass

# Test your solution
df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [10, 20, 30, 40, 50],
    'C': [100, 200, 300, 400, 500]
}, index=['row1', 'row2', 'row3', 'row4', 'row5'])

print("Using .loc:")
print(select_data(df, method='loc', rows=['row1', 'row3'], cols=['A', 'C']))

print("\nUsing .iloc:")
print(select_data(df, method='iloc', rows=[0, 2], cols=[0, 2]))

Using .loc:
      A    C
row1  1  100
row3  3  300

Using .iloc:
      A    C
row1  1  100
row3  3  300


In [47]:
#34
import pandas as pd

def filter_with_query(df, query_string):
    return df.query(query_string)
    pass

# Test your solution
df = pd.DataFrame({
    'age': [25, 30, 35, 40, 45],
    'salary': [50000, 60000, 70000, 80000, 90000],
    'department': ['IT', 'HR', 'IT', 'Sales', 'IT']
})

threshold = 65000

print("Age > 30 and salary > threshold:")
print(filter_with_query(df, "age > 30 and salary > @threshold"))

print("\nDepartment == 'IT':")
print(filter_with_query(df, "department == 'IT'"))

Age > 30 and salary > threshold:
   age  salary department
2   35   70000         IT
3   40   80000      Sales
4   45   90000         IT

Department == 'IT':
   age  salary department
0   25   50000         IT
2   35   70000         IT
4   45   90000         IT


In [48]:
#35
import pandas as pd

def select_top_n(df, n=5, column=None, columns=None, method='largest', keep='all'):
    # If percentage is given (like 0.2 for top 20%)
    if isinstance(n, float):
        n = math.ceil(len(df) * n)
    
    # Single column case → use nlargest / nsmallest
    if column is not None:
        if method == 'largest':
            return df.nlargest(n, column, keep=keep)
        else:
            return df.nsmallest(n, column, keep=keep)
    
    # Multiple columns → use sort_values
    elif columns is not None:
        ascending = [False] * len(columns) if method == 'largest' else [True] * len(columns)
        return df.sort_values(by=columns, ascending=ascending).head(n)
    
    else:
        raise ValueError("Provide either 'column' or 'columns'")
    pass

# Test your solution
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'score': [85, 92, 88, 92, 78],
    'time': [120, 100, 110, 95, 130]
})

print("Top 2 by score:")
print(select_top_n(df, n=2, column='score', method='largest'))

print("\nBottom 2 by time:")
print(select_top_n(df, n=2, column='time', method='smallest'))

Top 2 by score:
    name  score  time
1    Bob     92   100
3  David     92    95

Bottom 2 by time:
    name  score  time
3  David     92    95
1    Bob     92   100


In [49]:
#36
import pandas as pd

def group_and_aggregate(df, groupby, agg_col, agg_func='mean'):
    grouped = df.groupby(groupby)[agg_col]
    
    if agg_func == 'mean':
        result = grouped.mean()
    elif agg_func == 'sum':
        result = grouped.sum()
    elif agg_func == 'median':
        result = grouped.median()
    elif agg_func == 'count':
        result = grouped.count()
    elif agg_func == 'std':
        result = grouped.std()
    elif agg_func == 'min':
        result = grouped.min()
    elif agg_func == 'max':
        result = grouped.max()
    else:
        # custom function support
        result = grouped.apply(agg_func)
    
    return result.to_dict()
    pass

# Test your solution
df = pd.DataFrame({
    'category': ['A', 'B', 'A', 'B', 'A', 'C'],
    'value': [10, 20, 30, 40, 50, 60],
    'count': [1, 2, 3, 4, 5, 6]
})

print("Mean by category:")
print(group_and_aggregate(df, groupby='category', agg_col='value', agg_func='mean'))

print("\nSum by category:")
print(group_and_aggregate(df, groupby='category', agg_col='value', agg_func='sum'))

Mean by category:
{'A': 30.0, 'B': 30.0, 'C': 60.0}

Sum by category:
{'A': 90, 'B': 60, 'C': 60}


In [50]:
#37
import pandas as pd

def multi_agg(df, groupby):
    result = df.groupby(groupby).agg({
        'salary': ['mean', 'max', 'min'],
        'experience': ['mean', 'max']
    })
    return result
    pass

# Test your solution
df = pd.DataFrame({
    'department': ['IT', 'HR', 'IT', 'HR', 'Sales'],
    'employee': ['A', 'B', 'C', 'D', 'E'],
    'salary': [50000, 45000, 55000, 48000, 52000],
    'experience': [5, 3, 7, 4, 6]
})

result = multi_agg(df, groupby='department')
print(result)

             salary               experience    
               mean    max    min       mean max
department                                      
HR          46500.0  48000  45000        3.5   4
IT          52500.0  55000  50000        6.0   7
Sales       52000.0  52000  52000        6.0   6


In [52]:
import pandas as pd

def group_normalize(df, groupby, column):
    df = df.copy()
    
    group_mean = df.groupby(groupby)[column].transform('mean')
    group_std = df.groupby(groupby)[column].transform('std')
    
    df['normalized'] = (df[column] - group_mean) / group_std
    return df
    pass

# Test your solution
df = pd.DataFrame({
    'group': ['A', 'A', 'B', 'B', 'C', 'C'],
    'value': [10, 20, 30, 40, 50, 60]
})

result = group_normalize(df, groupby='group', column='value')
print(result)

  group  value  normalized
0     A     10   -0.707107
1     A     20    0.707107
2     B     30   -0.707107
3     B     40    0.707107
4     C     50   -0.707107
5     C     60    0.707107


In [55]:
import pandas as pd

def create_pivot(df, index, columns, values, aggfunc='sum', fill_value=None):
    pivot = pd.pivot_table(
        df,
        index=index,
        columns=columns,
        values=values,
        aggfunc=aggfunc,
        fill_value=fill_value
    )
    return pivot
    pass

# Test your solution
df = pd.DataFrame({
    'date': ['2024-01', '2024-01', '2024-02', '2024-02'],
    'product': ['A', 'B', 'A', 'B'],
    'sales': [100, 150, 120, 180],
    'quantity': [10, 15, 12, 18]
})

print("Pivot table:")
print(create_pivot(df, index='date', columns='product', values='sales'))

Pivot table:
product    A    B
date             
2024-01  100  150
2024-02  120  180


In [58]:
#40
import pandas as pd

def advanced_groupby(df, groupby, operations):
       result = df.copy()
    
     
       if 'filter_size' in operations:
           min_size = operations['filter_size']
           result = result.groupby(groupby).filter(lambda x: len(x) >= min_size)
    
    # 2. Rank within groups
       if operations.get('rank', False):
            result['rank'] = result.groupby(groupby)['sales'] \
                                .rank(method='dense', ascending=False)
    
    # 3. Percentage within groups
       if operations.get('pct', False):
            group_sum = result.groupby(groupby)['sales'].transform('sum')
            result['pct'] = result['sales'] / group_sum * 100
    
    # 4. Top N within each group
       if 'top_n' in operations:
            n = operations['top_n']
            result = result.sort_values(['store', 'sales'], ascending=[True, False])
            result = result.groupby(groupby).head(n)
    
       return result
       pass

# Test your solution
df = pd.DataFrame({
    'store': ['A', 'A', 'A', 'B', 'B', 'C', 'C', 'C', 'C'],
    'product': ['X', 'Y', 'Z', 'X', 'Y', 'X', 'Y', 'Z', 'W'],
    'sales': [100, 150, 200, 120, 180, 90, 110, 130, 140]
})

operations = {'filter_size': 3, 'top_n': 2, 'rank': True}
result = advanced_groupby(df, groupby='store', operations=operations)
print(result)

  store product  sales  rank
2     A       Z    200   1.0
1     A       Y    150   2.0
8     C       W    140   1.0
7     C       Z    130   2.0


In [60]:
#41
import pandas as pd
import numpy as np

def analyze_missing(df):
    total_missing = df.isna().sum().sum()
    
    by_column = df.isna().sum().to_dict()
    
    percentage = (df.isna().mean() * 100).round(2).to_dict()
    
    rows_with_missing = df.index[df.isna().any(axis=1)].tolist()
    
    complete_rows = df.dropna().shape[0]
    
    return {
        'total_missing': int(total_missing),
        'by_column': by_column,
        'percentage': percentage,
        'rows_with_missing': rows_with_missing,
        'complete_rows': complete_rows
    }
    pass

# Test your solution
df = pd.DataFrame({
    'A': [1, 2, None, 4, 5],
    'B': [None, 2, 3, None, 5],
    'C': [1, 2, 3, 4, 5]
})

result = analyze_missing(df)
print("Missing Data Analysis:")
for key, value in result.items():
    print(f"{key}: {value}")

Missing Data Analysis:
total_missing: 3
by_column: {'A': 1, 'B': 2, 'C': 0}
percentage: {'A': 20.0, 'B': 40.0, 'C': 0.0}
rows_with_missing: [0, 2, 3]
complete_rows: 2


In [61]:
#42
import pandas as pd
import numpy as np

def impute_missing(df, column, method='mean', groupby=None):
    df = df.copy()
    
    if method == 'mean':
        df[column] = df[column].fillna(df[column].mean())
    
    elif method == 'median':
        df[column] = df[column].fillna(df[column].median())
    
    elif method == 'mode':
        df[column] = df[column].fillna(df[column].mode()[0])
    
    elif method == 'ffill':
        df[column] = df[column].fillna(method='ffill')
    
    elif method == 'bfill':
        df[column] = df[column].fillna(method='bfill')
    
    elif method == 'interpolate':
        df[column] = df[column].interpolate()
    
    elif method == 'group_mean':
        if groupby is None:
            raise ValueError("groupby must be provided for group_mean")
        df[column] = df[column].fillna(
            df.groupby(groupby)[column].transform('mean')
        )
    
    else:
        raise ValueError("Invalid method")
    
    return df
    pass

# Test your solution
df = pd.DataFrame({
    'group': ['A', 'A', 'A', 'B', 'B', 'B'],
    'value': [10.0, None, 30.0, 40.0, None, 60.0]
})

print("Original:")
print(df)

print("\nImpute with mean:")
print(impute_missing(df, column='value', method='mean'))

print("\nImpute with group mean:")
print(impute_missing(df, column='value', method='group_mean', groupby='group'))

Original:
  group  value
0     A   10.0
1     A    NaN
2     A   30.0
3     B   40.0
4     B    NaN
5     B   60.0

Impute with mean:
  group  value
0     A   10.0
1     A   35.0
2     A   30.0
3     B   40.0
4     B   35.0
5     B   60.0

Impute with group mean:
  group  value
0     A   10.0
1     A   20.0
2     A   30.0
3     B   40.0
4     B   50.0
5     B   60.0


In [62]:
import pandas as pd
import numpy as np

def drop_missing(df, axis=0, how='any', thresh=None, subset=None):
    return df.dropna(axis=axis, how=how, thresh=thresh, subset=subset)
    pass

# Test your solution
df = pd.DataFrame({
    'A': [1, None, None, 4],
    'B': [None, None, None, None],
    'C': [1, 2, 3, 4],
    'D': [1, 2, None, 4]
})

print("Original:")
print(df)

print("\nDrop rows with any missing:")
print(drop_missing(df, axis=0, how='any'))

print("\nDrop columns with all missing:")
print(drop_missing(df, axis=1, how='all'))

Original:
     A     B  C    D
0  1.0  None  1  1.0
1  NaN  None  2  2.0
2  NaN  None  3  NaN
3  4.0  None  4  4.0

Drop rows with any missing:


TypeError: You cannot set both the how and thresh arguments at the same time.

In [63]:
#44
import pandas as pd

def merge_dataframes(df1, df2, on=None, how='inner', left_on=None, right_on=None):
    return pd.merge(
        df1,
        df2,
        on=on,
        how=how,
        left_on=left_on,
        right_on=right_on
    )
    pass

# Test your solution
df1 = pd.DataFrame({
    'key': ['A', 'B', 'C'],
    'value1': [1, 2, 3]
})

df2 = pd.DataFrame({
    'key': ['B', 'C', 'D'],
    'value2': [4, 5, 6]
})

print("Inner join:")
print(merge_dataframes(df1, df2, on='key', how='inner'))

print("\nLeft join:")
print(merge_dataframes(df1, df2, on='key', how='left'))

print("\nOuter join:")
print(merge_dataframes(df1, df2, on='key', how='outer'))

Inner join:
  key  value1  value2
0   B       2       4
1   C       3       5

Left join:
  key  value1  value2
0   A       1     NaN
1   B       2     4.0
2   C       3     5.0

Outer join:
  key  value1  value2
0   A     1.0     NaN
1   B     2.0     4.0
2   C     3.0     5.0
3   D     NaN     6.0


In [64]:
#45
import pandas as pd

def concat_dataframes(df1, df2, axis=0, ignore_index=False):
    return pd.concat([df1, df2], axis=axis, ignore_index=ignore_index)
    pass

# Test your solution
df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]})

print("Vertical concatenation:")
print(concat_dataframes(df1, df2, axis=0))

print("\nHorizontal concatenation:")
print(concat_dataframes(df1, df2, axis=1))

Vertical concatenation:
   A  B
0  1  3
1  2  4
0  5  7
1  6  8

Horizontal concatenation:
   A  B  A  B
0  1  3  5  7
1  2  4  6  8
