# Importing packages and reading the table

In [1]:
import csv
import pandas as pd

In [2]:
# Load the raw sample sheet exactly as delivered; nothing is cleaned at this point.
df = pd.read_excel("messy_sample_sheet.xlsx")
# Bare last expression so the notebook renders the frame as a table.
df

Unnamed: 0,SID,Condition,Sex,Batch,PMI,RIN,Liver,BrainPH,BrainWeight,LeftRightBrain,Smoking
0,SRR15466722,Control,male,2.0,29,49,MildSteatosis,6.12,1320,Left,Never
1,SRR15466723,Control,Female,1.0,15,7.3,Normal,6.93,1330,Right,
2,SRR15466724,Control,female,2.0,11,3.4,Normal,6.21,1200,Left,Current
3,SRR15466725,AlcoholUseDisorder,female,1.0,38,7.8,Normal,6.54,1180,Right,Current
4,SRR15466742,Control,female,2.0,23,4.6,Congestion,6.17,1340,Left,NAN
5,SRR15466743,AlcoholUseDisorder,male,1.0,39.5,8,Cirrhosis,6.34,1412,Left,Current
6,SRR15466744,Control,femalle,1.0,29.5,8.2,Normal,6.78,1195,Left,Never
7,SRR15466745,AlcoholUseDisorder,female,2.0,37,68,Steatosis,6.95,1139,Right,Never
8,SRR15466746,AlcoholUseDisorder,male,2.0,61,7.3,Steatosis,6.79,1670,Left,Current
9,SRR15466729,Control,male,1.0,12,7.6,Steatosis,6.39,1631,Left,Never


# Things to check/fix

- Whether all SIDs are unique
- Condition, Sex, Liver, LeftRightBrain, Smoking values standardized (no two different spellings for the same value)
- Numbers with a consistent decimal separator (either comma or dot)
- Find missing values, decide what to do with them

## Initial step: removing samples that are obviously bad

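In other words, we can check which samples have a lot of missing or unusable values. This is easiest to do on the numeric columns, where any entry that cannot be parsed as a number stands out. Batch is a categorical variable, but it is encoded with numbers, so it is included in the check.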

In [3]:
numeric_cols = ["Batch", "PMI", "RIN", "BrainPH", "BrainWeight"]


def _parses_as_number(value) -> bool:
    """Return True if `value` reads as a float once a decimal comma is turned into a dot.

    Note that float() also accepts "nan" and "inf", so a genuinely empty cell (NaN)
    passes this test; only text that is not a number at all fails it.
    """
    try:
        float(str(value).replace(",", "."))
    except ValueError:
        return False
    return True


sids_to_remove = []
# iterrows() walks the rows in order regardless of the index labels, so this does
# not rely on the index being exactly 0..n-1 (which df[nc][i] silently required).
for _, row in df.iterrows():
    failed_check = sum(not _parses_as_number(row[nc]) for nc in numeric_cols)
    # A single unparsable numeric field is enough to flag the sample for removal.
    if failed_check > 0:
        print(f"{row['SID']} failed {failed_check} checks")
        sids_to_remove.append(row["SID"])
sids_to_remove

SRA15500000 failed 5 checks


['SRA15500000']

In [4]:
# Drop the samples listed in sids_to_remove; print the shape before and after
# so the number of removed rows is visible.
print(df.shape)
# .copy() makes the filtered result an independent frame, so later column
# assignments on it do not raise SettingWithCopyWarning.
df = df.loc[~df["SID"].isin(sids_to_remove)].copy()
print(df.shape)

(25, 11)
(24, 11)


## Checking if SIDs are all unique

- set() leaves only unique values
- True means that there are no duplicate values

In [5]:
# If no SID repeats, the de-duplicated set is as large as the column itself.
len(set(df["SID"])) == df["SID"].size

True

## Standardizing Condition values

In [6]:
# Distinct spellings currently present in the column.
set(df["Condition"].unique())

{'AlcoholUseDisorder', 'AlcoholUsedisorder', 'Control'}

In [7]:
# Map the mis-capitalised spelling onto the canonical label. Series.replace leaves
# every other value (including missing ones) untouched, whereas passing each value
# through str() would silently turn NaN into the literal text "nan".
conds = df["Condition"].replace({"AlcoholUsedisorder": "AlcoholUseDisorder"})
# Work on an independent copy so the assignment below cannot touch a view of another frame.
df = df.copy()
df["Condition"] = conds

In [8]:
# Re-check: only the canonical labels should remain.
set(df["Condition"].unique())

{'AlcoholUseDisorder', 'Control'}

## Standardizing Sex values

There is more variation here, so the fix is not as simple as it was for Condition.

In [9]:
# Distinct spellings currently present in the column.
set(df["Sex"].unique())

{'Female', 'Male', 'f', 'female', 'femalle', 'male'}

In [10]:
# Every spelling seen in the column, mapped to its canonical label.
sex_labels = {
    "Female": "Female",
    "f": "Female",
    "female": "Female",
    "femalle": "Female",
    "Male": "Male",
    "male": "Male",
}
raw_sex = df["Sex"]
# Values missing from the mapping come back as NaN.
new_values = raw_sex.map(sex_labels)
for s in raw_sex[new_values.isna()]:
    print(f"Undetected value: {s}")
# Keep unrecognised values as they were instead of dropping them: the column stays
# the same length as the frame and the follow-up set() check will expose them.
new_values = new_values.fillna(raw_sex)
df["Sex"] = new_values

In [11]:
# Re-check: only the canonical labels should remain.
set(df["Sex"].unique())

{'Female', 'Male'}

## Standardizing Liver values

- Again, different spellings of the same value must be merged into a single name
- Cirrhosis, congestion and steatosis seem to be different conditions and can't be merged
- MildSteatosis, however, will be merged into Steatosis, because severity levels are not being examined and there is no information on whether the plain Steatosis cases are mild or not

In [12]:
# Distinct spellings currently present in the column.
set(df["Liver"].unique())

{'Cirrhosis',
 'Cirrhosiss',
 'Congestion',
 'Conkestion',
 'MildSteatosis',
 'Normal',
 'Steatosis',
 'Steatosiss',
 'normal'}

In [13]:
# Every spelling seen in the column, mapped to its canonical label.
# MildSteatosis is deliberately folded into Steatosis.
liver_labels = {
    "Cirrhosis": "Cirrhosis",
    "Cirrhosiss": "Cirrhosis",
    "Congestion": "Congestion",
    "Conkestion": "Congestion",
    "MildSteatosis": "Steatosis",
    "Steatosis": "Steatosis",
    "Steatosiss": "Steatosis",
    "Normal": "Normal",
    "normal": "Normal",
}
raw_liver = df["Liver"]
# Values missing from the mapping come back as NaN.
new_values = raw_liver.map(liver_labels)
for s in raw_liver[new_values.isna()]:
    print(f"Undetected value: {s}")
# Keep unrecognised values as they were instead of dropping them: the column stays
# the same length as the frame and the follow-up set() check will expose them.
new_values = new_values.fillna(raw_liver)
df["Liver"] = new_values

In [14]:
# Re-check: only the canonical labels should remain.
set(df["Liver"].unique())

{'Cirrhosis', 'Congestion', 'Normal', 'Steatosis'}

## Standardizing LeftRightBrain values

- based on the result below, no standardization is needed

In [15]:
# Distinct values present in the column.
set(df["LeftRightBrain"].unique())

{'Left', 'Right'}

## Standardizing Smoking values

In [16]:
# Distinct spellings currently present in the column (missing entries show up as nan).
set(df["Smoking"].unique())

{'Current', 'Ex_smoker', 'Ex_smokers', 'NAN', 'Never', nan, 'never'}

In [17]:
# Every spelling seen in the column, mapped to its canonical label.
# "Unknown" maps to itself so that re-running the cell is harmless.
smoking_labels = {
    "Current": "Current",
    "Ex_smoker": "Ex_smoker",
    "Ex_smokers": "Ex_smoker",
    "Never": "Never",
    "never": "Never",
    "NAN": "Unknown",
    "Unknown": "Unknown",
}
raw_smoking = df["Smoking"]
new_values = raw_smoking.map(smoking_labels)
# A real value that the mapping does not cover (as opposed to an empty cell).
unmapped = new_values.isna() & raw_smoking.notna()
for s in raw_smoking[unmapped]:
    print(f"Undetected value: {s}")
# Unrecognised values are kept as they were so the column stays aligned with the
# frame; whatever is still NaN afterwards was an empty cell and becomes "Unknown".
new_values = new_values.where(~unmapped, raw_smoking).fillna("Unknown")
df["Smoking"] = new_values

In [18]:
# Re-check: only the canonical labels should remain.
set(df["Smoking"].unique())

{'Current', 'Ex_smoker', 'Never', 'Unknown'}

## Numbers with a consistent decimal separator (either comma or dot)

- numeric columns were defined before:
    * *numeric_cols = ["Batch", "PMI", "RIN", "BrainPH", "BrainWeight"]*

In [19]:
df = df.copy()
for nc in numeric_cols:
    # Turn decimal commas into dots and convert the whole column in one go.
    # astype(float) raises ValueError on anything that is not a number, just as
    # float() did; missing entries become NaN.
    # Assigning through df[nc] (rather than df.loc[:, nc]) replaces the column, so
    # it really ends up with a float dtype instead of staying an object column.
    df[nc] = df[nc].astype(str).str.replace(",", ".", regex=False).astype(float)

  df.loc[:, nc] = new_values


# Final dataframe: displaying it and saving it to a new file

In [20]:
# Save the cleaned sheet. With the default arguments the row index is written as
# an extra, unnamed first column.
df.to_excel("clean_sample_sheet.xlsx")
# Bare last expression so the notebook renders the cleaned frame as a table.
df

Unnamed: 0,SID,Condition,Sex,Batch,PMI,RIN,Liver,BrainPH,BrainWeight,LeftRightBrain,Smoking
0,SRR15466722,Control,Male,2.0,29.0,4.9,Steatosis,6.12,1320.0,Left,Never
1,SRR15466723,Control,Female,1.0,15.0,7.3,Normal,6.93,1330.0,Right,Unknown
2,SRR15466724,Control,Female,2.0,11.0,3.4,Normal,6.21,1200.0,Left,Current
3,SRR15466725,AlcoholUseDisorder,Female,1.0,38.0,7.8,Normal,6.54,1180.0,Right,Current
4,SRR15466742,Control,Female,2.0,23.0,4.6,Congestion,6.17,1340.0,Left,Unknown
5,SRR15466743,AlcoholUseDisorder,Male,1.0,39.5,8.0,Cirrhosis,6.34,1412.0,Left,Current
6,SRR15466744,Control,Female,1.0,29.5,8.2,Normal,6.78,1195.0,Left,Never
7,SRR15466745,AlcoholUseDisorder,Female,2.0,37.0,6.8,Steatosis,6.95,1139.0,Right,Never
8,SRR15466746,AlcoholUseDisorder,Male,2.0,61.0,7.3,Steatosis,6.79,1670.0,Left,Current
9,SRR15466729,Control,Male,1.0,12.0,7.6,Steatosis,6.39,1631.0,Left,Never


# Cleaning sequencing data

In [22]:
# The sample sheet is not needed any more in this notebook, so the name `df` is
# deliberately reused for the count table; this lets the old frame be freed.
df = pd.read_csv("rnaseq_default_counts.csv")
# Bare last expression so the notebook renders the (truncated) table.
df

Unnamed: 0.1,Unnamed: 0,SRR15466722,SRR15466723,SRR15466724,SRR15466725,SRR15466742,SRR15466743,SRR15466744,SRR15466745,SRR15466746,...,SRR15466728,SRR15466733,SRR15466734,SRR15466735,SRR15466736,SRR15466737,SRR15466738,SRR15466739,SRR15466740,SRR15466741
0,ENSG00000000003,146,28,74,53,157,47,38,90,73,...,53,261,250,29,148,37,88,25,28,71
1,ENSG00000000005,1,2,0,4,0,2,2,2,4,...,1,2,5,0,1,1,3,0,0,3
2,ENSG00000000419,287,229,206,299,284,291,277,460,373,...,232,1030,723,211,390,180,397,229,159,399
3,ENSG00000000457,225,142,154,211,156,156,205,271,328,...,134,710,408,116,234,155,230,174,93,283
4,ENSG00000000460,59,81,52,57,59,49,64,92,62,...,35,200,139,42,62,49,70,57,21,71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62695,ENSG00000292369,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
62696,ENSG00000292370,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
62697,ENSG00000292371,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
62698,ENSG00000292372,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Things to check/fix

- check missing data
- give proper name for ID column
- check if ID values are unique
- data normalization

## Checking if there is any missing data

In [24]:
# True if at least one cell anywhere in the table is missing.
df.isna().to_numpy().any()

False