# In [None]:
from faker import Faker
import numpy as np
import pandas as pd
import random

# Single Faker instance reused for names, UUIDs and categorical picks.
fake = Faker()

# Number of synthetic student rows to produce.
num_students = 100


def _build_student_record():
    """Return one synthetic student row as a plain dict.

    Values are drawn from ``fake`` and the ``random`` module in a fixed
    order, so the sequence of random draws per row is always the same.
    """
    full_name = fake.name()
    years = random.randint(18, 25)

    # When the generated name contains "John", the age is appended to the
    # name and the age field itself is left empty.
    # NOTE(review): presumably a planted "value in the wrong column" issue
    # for later cleaning exercises -- confirm.
    if "John" in full_name:
        full_name, years = f"{full_name} {years}", None

    return {
        "student_id": fake.uuid4(),
        "name": full_name,
        "age": years,
        "gender": fake.random_element(["Male", "Female"]),
        "grade": random.choice(["A", "B", "C", "D", "F"]),
        "major": fake.random_element(["Math", "Science", "History", "Art"]),
        "gpa": round(random.uniform(2.5, 4.0), 2),
        "attendance_rate": random.randint(70, 100),
        "extracurricular_activities": fake.random_element(
            ["Sports", "Music", "Debate", "Clubs", "None"]
        ),
        "study_hours_per_week": random.randint(10, 30),
        # Unlike gpa and distance, this value is stored without rounding.
        "sleep_hours_per_day": random.uniform(5.0, 9.0),
        "distance_from_home_km": round(random.uniform(1.0, 30.0), 2),
    }


# One record per student, collected in generation order.
data = [_build_student_record() for _ in range(num_students)]

# Persist the generated dataset before any quality issues are injected.
df = pd.DataFrame(data)
df.to_csv("student_data.csv", index=False)

# Introduce data quality issues so the dataset can be used to practise
# data preprocessing. Every df.sample(...) call below draws its own random
# subset of rows, so the affected subsets may overlap.

# Simulate missing values in roughly 10% / 10% / 5% of the rows.
df.loc[df.sample(frac=0.1).index, 'age'] = None
df.loc[df.sample(frac=0.1).index, 'gpa'] = np.nan
df.loc[df.sample(frac=0.05).index, 'gender'] = None

# Introduce outliers: a GPA of 5.0 and an age of 80 on about 2% of rows each.
df.loc[df.sample(frac=0.02).index, 'gpa'] = 5.0
df.loc[df.sample(frac=0.02).index, 'age'] = 80

# Introduce duplicate records by re-adding about 3% of the rows at the end.
# DataFrame.append was removed in pandas 2.0; pd.concat with
# ignore_index=True yields the same frame with a fresh 0..n-1 index.
duplicate_indices = df.sample(frac=0.03).index
df = pd.concat([df, df.loc[duplicate_indices]], ignore_index=True)

# Wrong data type: store the numeric study hours as strings.
df['study_hours_per_week'] = df['study_hours_per_week'].astype(str)

# Out-of-range value: a negative number of study hours (as a string, to
# match the column's new type).
df.loc[df.sample(frac=0.01).index, 'study_hours_per_week'] = '-5'

# Inconsistent data: overwrite some activities with the literal string
# 'None' and assign a major ('Music') that is not among the generated majors.
df.loc[df.sample(frac=0.02).index, 'extracurricular_activities'] = 'None'
df.loc[df.sample(frac=0.01).index, 'major'] = 'Music'

# Despite the file name, this CSV holds the deliberately degraded data.
df.to_csv('preprocessed_student_data.csv', index=False)