In [1]:
# Question: Detecting Data Drift
# Description: Identify potential data drift between two time periods for a numeric attribute.


In [2]:
import pandas as pd
import os # Import the os module for file path operations (optional, but good practice)

# --- Define Sample Data ---
# Column-oriented records: every key becomes a CSV column header and every
# list holds that column's values. All lists must have the same length,
# otherwise pd.DataFrame raises a ValueError.
sample_data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Heidi', 'Ivan', 'Judy'],
    'Age': [24, 30, 35, 28, 42, 55, 29, 31, 40, 27],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'Philadelphia', 'San Antonio', 'San Diego', 'Dallas', 'San Jose'],
    'Occupation': ['Engineer', 'Artist', 'Doctor', 'Teacher', 'Designer', 'Manager', 'Analyst', 'Consultant', 'Developer', 'Salesperson'],
    'Salary': [70000, 55000, 120000, 60000, 80000, 95000, 65000, 72000, 100000, 58000]
}

# --- Create a Pandas DataFrame from the Sample Data ---
# One row per person, one column per key of sample_data.
df = pd.DataFrame(sample_data)

# --- Define the CSV File Path ---
# A bare file name is resolved against the current working directory of the
# process running this cell.
csv_file_name = 'my_generated_data.csv'

# --- Save the DataFrame to a CSV File ---
# index=False keeps the DataFrame's positional index (0, 1, 2, ...) out of the
# file, so the CSV contains only the five data columns.
#
# Only the write itself is guarded, and only against OSError (missing
# directory, permission denied, disk full, ...). Any other exception is a
# programming error and is allowed to propagate instead of being printed and
# silently ignored.
try:
    df.to_csv(csv_file_name, index=False)
except OSError as e:
    print(f"An error occurred while creating the CSV file: {e}")
else:
    # Reached only when the write succeeded.
    print(f"CSV file '{csv_file_name}' created successfully!")
    print(f"It contains {len(df)} rows and {len(df.columns)} columns.")
    print("\nFirst 5 rows of the generated data:")
    print(df.head())  # Display the first few rows to confirm content

# Optional sanity check that a file with this name is present on disk.
# Note: this only tests existence, so a file left over from an earlier run
# also satisfies it even if the write above failed.
if os.path.exists(csv_file_name):
    print(f"\nFile '{csv_file_name}' found at: {os.path.abspath(csv_file_name)}")
else:
    print(f"\nFile '{csv_file_name}' was not found after creation attempt.")



CSV file 'my_generated_data.csv' created successfully!
It contains 10 rows and 5 columns.

First 5 rows of the generated data:
      Name  Age         City Occupation  Salary
0    Alice   24     New York   Engineer   70000
1      Bob   30  Los Angeles     Artist   55000
2  Charlie   35      Chicago     Doctor  120000
3    David   28      Houston    Teacher   60000
4      Eve   42      Phoenix   Designer   80000

File 'my_generated_data.csv' found at: /workspaces/AI_DATA_ANALYSIS_/src/Module 3/Identifying & Measuring Data Quality Issues/my_generated_data.csv


In [3]:
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp

# Fixed seed so the simulated samples -- and therefore the KS statistic and
# p-value printed below -- are identical on every Restart & Run All.
SEED = 42
# Significance level: drift is reported when the KS p-value falls below it.
ALPHA = 0.05
rng = np.random.default_rng(SEED)


def detect_drift(reference, current, alpha=ALPHA):
    """Compare two numeric samples with the two-sample Kolmogorov-Smirnov test.

    Parameters
    ----------
    reference : array-like of numbers
        Values of the attribute in the earlier (baseline) period.
    current : array-like of numbers
        Values of the same attribute in the later period.
    alpha : float, default ALPHA
        Significance level; drift is flagged when the p-value is below it.

    Returns
    -------
    tuple of (float, float, bool)
        The KS statistic, the p-value, and whether drift was detected.

    Raises
    ------
    ValueError
        If either sample contains no non-missing values.
    """
    # Missing values are dropped first: a NaN p-value would compare False
    # against alpha and be silently reported as "no drift".
    reference = pd.Series(reference, dtype=float).dropna().to_numpy()
    current = pd.Series(current, dtype=float).dropna().to_numpy()
    if reference.size == 0 or current.size == 0:
        raise ValueError("Both samples must contain at least one non-missing value.")

    statistic, p_value = ks_2samp(reference, current)
    return float(statistic), float(p_value), bool(p_value < alpha)


# Simulate old data with mean 40, std 10
df_old = pd.DataFrame({
    'Age': rng.normal(40, 10, 1000)
})

# Simulate new data with mean 45, std 10 (shifted distribution)
df_new = pd.DataFrame({
    'Age': rng.normal(45, 10, 1000)
})

# Detect drift using Kolmogorov-Smirnov test
stat, p_value, drift_detected = detect_drift(df_old['Age'], df_new['Age'])
print(f"KS statistic: {stat}")
print(f"P-value: {p_value}")

if drift_detected:
    print("Data drift detected in 'Age' distribution.")
else:
    print("No significant data drift detected.")


KS statistic: 0.212
P-value: 4.397213520931296e-20
Data drift detected in 'Age' distribution.


In [4]:
import pandas as pd
import numpy as np

# Seed the generator so this cell produces the same simulated samples on
# every run; without it each execution draws different data.
SEED = 42
rng = np.random.default_rng(SEED)

# Simulate old data
df_old = pd.DataFrame({
    'Age': rng.normal(40, 10, 1000)  # mean 40, std 10, 1000 samples
})

# Simulate new data (shifted mean to 45)
df_new = pd.DataFrame({
    'Age': rng.normal(45, 10, 1000)  # mean 45, std 10, 1000 samples
})
