In [1]:
import pandas as pd

# Read the raw export into a DataFrame.
# The file is parsed as tab-separated text, lines starting with '#' are skipped
# as comments, and header=None means no row is used for column names, so the
# columns are labelled 0..3.
# NOTE(review): the stray "exec(code_obj, ...)" line in this cell's output looks
# like the tail of a pandas warning (possibly a DtypeWarning for a mixed-type
# column) — confirm, and consider passing an explicit dtype= if so.
file_path = '23andMe_data.txt'
data = pd.read_csv(
    file_path,
    sep='\t',
    comment='#',
    header=None,
)

# Preview the first rows of the loaded frame
data.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,0,1,2,3
0,rs12564807,1,734462,AA
1,rs3131972,1,752721,AG
2,rs148828841,1,760998,CC
3,rs12124819,1,776546,AG
4,rs115093905,1,787173,--


In [2]:
# Name the columns of the headerless frame.
# Assumes the four columns are, in order, rsid / chromosome / position / genotype
# (consistent with the preview above) — confirm against the raw file's '#' header.
data.columns = ['rsid', 'chromosome', 'position', 'genotype']

# Sort by chromosome, then by position, both ascending.
# After parsing, chromosome labels may be a mix of ints and strings (or all
# strings), so sorting the raw column does not give a reliable chromosome order:
# 22 and '22' are treated as different values, and strings sort '1' < '10' < '2'.
# Rank the labels explicitly instead. The order below (1-22, then X, Y, MT) is
# presumably the full label set of a 23andMe export — confirm. Any label outside
# this list gets a NaN rank and therefore sorts last.
chromosome_order = [str(n) for n in range(1, 23)] + ['X', 'Y', 'MT']
chromosome_rank = data['chromosome'].astype(str).map(
    {label: rank for rank, label in enumerate(chromosome_order)}
)
sorted_data = (
    data.assign(_chromosome_rank=chromosome_rank)
    .sort_values(by=['_chromosome_rank', 'position'])
    .drop(columns='_chromosome_rank')  # helper column is only needed for sorting
)

# Saving the cleaned and sorted data to a new CSV file
sorted_data.to_csv('Cleaned_23andMe.csv', index=False)

# Display the first few rows of the sorted dataframe
sorted_data.head()

Unnamed: 0,rsid,chromosome,position,genotype
0,rs12564807,1,734462,AA
1,rs3131972,1,752721,AG
2,rs148828841,1,760998,CC
3,rs12124819,1,776546,AG
4,rs115093905,1,787173,--


In [3]:
# Keep only rows with a usable genotype: drop missing values (NaN) as well as
# the '--' placeholder. A plain `!= '--'` comparison on its own lets NaN rows
# through, and those would break the later per-character split of the genotype
# (list() of a float NaN raises TypeError).
has_genotype = sorted_data['genotype'].notna() & (sorted_data['genotype'] != '--')
filtered_data = sorted_data[has_genotype]

# Saving the filtered and sorted data (overwrites the previously written CSV)
filtered_data.to_csv('Cleaned_23andMe.csv', index=False)

# Display the first few rows of the filtered dataframe
filtered_data.head()

Unnamed: 0,rsid,chromosome,position,genotype
0,rs12564807,1,734462,AA
1,rs3131972,1,752721,AG
2,rs148828841,1,760998,CC
3,rs12124819,1,776546,AG
5,rs11240777,1,798959,AG


In [4]:
# Keep every column except the identifier ('rsid') and 'chromosome'.
# NOTE(review): without 'chromosome', a position value alone may no longer
# identify a row uniquely — confirm this is intended.
final_data = filtered_data.drop(columns=['rsid', 'chromosome'])

# Write the reduced frame out, replacing the earlier contents of this CSV
final_data.to_csv('Cleaned_23andMe.csv', index=False)

# Preview the result
final_data.head()

Unnamed: 0,position,genotype
0,734462,AA
1,752721,AG
2,760998,CC
3,776546,AG
5,798959,AG


In [5]:
# Turn each genotype string into one row per character, keeping its position:
#   1. index by position and select the genotype Series
#   2. convert every genotype string into a list of its characters
#   3. explode the lists so each character gets its own row
#   4. move position back from the index into a regular column
expanded_data = (
    final_data
    .set_index('position')['genotype']
    .apply(list)
    .explode()
    .reset_index()
)

# Set the column labels explicitly
expanded_data.columns = ['position', 'genotype']

# Persist the one-character-per-row table
expanded_data.to_csv('Expanded_23andMe.csv', index=False)

# Preview the result
expanded_data.head()

Unnamed: 0,position,genotype
0,734462,A
1,734462,A
2,752721,A
3,752721,G
4,760998,C


In [6]:
# This cell repeats the two preceding steps: it rebuilds final_data from
# filtered_data (dropping 'rsid' and 'chromosome') and then re-runs the same
# genotype expansion, writing the same output file again.
final_data = filtered_data.drop(columns=['rsid', 'chromosome'])

# One row per genotype character, with position restored as a column
expanded_data = (
    final_data
    .set_index('position')['genotype']
    .apply(list)
    .explode()
    .reset_index()
)

# Set the column labels explicitly
expanded_data.columns = ['position', 'genotype']

# Persist the expanded table (replaces the file written by the previous cell)
expanded_data.to_csv('Expanded_23andMe.csv', index=False)

# Preview the result
expanded_data.head()

Unnamed: 0,position,genotype
0,734462,A
1,734462,A
2,752721,A
3,752721,G
4,760998,C


In [7]:
# Rename the 'genotype' column to 'Ref' (the 'Alt' column is added further down).
# Rebind the name to the renamed copy instead of using inplace=True, so the
# frame object that earlier cells displayed is not mutated behind their back.
expanded_data = expanded_data.rename(columns={'genotype': 'Ref'})

# Look up the partner base for a single character
def pair_genotype(genotype):
    """Return the base paired with ``genotype``.

    The mapping is A <-> T and C <-> G (upper case only). Any other value,
    including lower-case letters and missing values, yields an empty string.
    """
    complements = dict(zip('ATCG', 'TAGC'))
    try:
        return complements[genotype]
    except KeyError:
        # Not one of the four recognised bases
        return ''

# Build the 'Alt' column by passing every 'Ref' value through pair_genotype
expanded_data['Alt'] = expanded_data['Ref'].map(pair_genotype)

# Write the table with both columns to its own CSV file
expanded_data.to_csv('Paired_Expanded_23andMe.csv', index=False)

# Preview the result
expanded_data.head()

Unnamed: 0,position,Ref,Alt
0,734462,A,T
1,734462,A,T
2,752721,A,T
3,752721,G,C
4,760998,C,G


# Data Cleaning and Processing Steps with Function Descriptions

This notebook documents the steps taken to clean and process the 23andMe genetic data, including the functions used. Here's a detailed breakdown of the process:

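1. **Loading the Data**: Used `pd.read_csv` to load the raw, tab-separated data from '23andMe_data.txt' into a pandas DataFrame, skipping lines that start with `#`. The columns were then named rsid, chromosome, position, and genotype, and the rows were sorted by chromosome and position.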

2. **Initial Cleaning**: Applied DataFrame methods to remove rows with empty values or '--' in the 'genotype' column, ensuring only valid genetic data is considered.

3. **Dropping Unnecessary Columns**: Used `DataFrame.drop` to remove 'rsid' and 'chromosome' columns, focusing on relevant data for analysis.

4. **Expanding Genotype Data**: Utilized `DataFrame.set_index` and `Series.explode` to expand the 'genotype' column so that each row contains a single allele character (for example A, T, C, or G) alongside its position.

5. **Renaming and Pairing Genotypes**: Renamed 'genotype' to 'Ref' using `DataFrame.rename`. Created a custom function `pair_genotype` that maps each base to its complementary base (A with T, and C with G) and returns an empty string for any other value, then applied it to form the 'Alt' column.

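6. **Saving Processed Data**: Saved the processed data into CSV files at each significant step using `DataFrame.to_csv`, facilitating easy access and further analysis: 'Cleaned_23andMe.csv' (overwritten after sorting, after filtering, and after dropping columns), 'Expanded_23andMe.csv', and 'Paired_Expanded_23andMe.csv'.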