In [None]:
from IPython.display import YouTubeVideo
# Embed the YouTube video with this ID in the cell output.
YouTubeVideo(id="FEySqC72B1o")

# Finding Gene Variants from Literature

![codon](./AACodon.jpg)

![doublehelix](doublehelix.png)

In [None]:
import pandas as pd
import re
import numpy as np

In [None]:
# Load the spreadsheet into a DataFrame; header=0 takes the column names
# from the first row of the sheet.
df = pd.read_excel(io="Yaeger2018.xlsx", header=0)

# Report (rows, columns), then preview the first rows.
print(df.shape)
df.head()

### Select columns

In [None]:
# Keep only the four leading columns of the sheet (selected by position
# in the column index, then looked up by label).
first_four = df.columns[:4]
df = df[first_four]
df.head()

In [None]:
# Columns this notebook works with.
columns = ['Hugo_Symbol', 'HGVSc', 'HGVSp_Short', 'dbSNP_RS']

# Replace missing values (NaN) in those columns with empty strings,
# then report the row count before de-duplication.
df[columns] = df[columns].fillna("")
print(len(df))

# Keep only the first row for each distinct HGVSc value and report
# the row count again.
df = df.drop_duplicates(subset=['HGVSc'])
print(len(df))

### Select genes

In [None]:
# Gene symbols used to filter the variant table via `Hugo_Symbol.isin(genes)`.
# Matching is exact, so symbols must not carry stray whitespace: 'FAM123B' and
# 'SLX4' previously had a trailing space and could never match a row.
# A repeated 'ABRAXAS1' entry was also removed (membership is unaffected).
# NOTE(review): the list appears to mix current symbols with older aliases
# (e.g. 'ERBB1' next to 'EGFR') — confirm against the symbols used in the data.
# NOTE(review): 'PPPRR2A' looks like a typo of 'PPP2R2A' (also listed); it is
# kept unchanged here — confirm and remove if so.
genes = (
    'AKT1', 'APC', 'APLF', 'ARID1A', 'AR', 'ATM', 'ATR', 'FRP1', 'AXIN2', 'BARD1',
    'BLM', 'BMPR1A', 'BRAF', 'BRCA1', 'BRCA2', 'BRIP1', 'BTG2', 'CCND3', 'CCNE1',
    'CDH1', 'CDK4', 'CDK12', 'CDKN1B', 'CHD4', 'CHEK1', 'CHEK2', 'KIT', 'CSMD3',
    'CTCF', 'CUL3', 'DDR2', 'DHX9', 'BAP1', 'DNMT3A', 'EGFR', 'ERBB1', 'EDNRB',
    'ELF3', 'EMSY', 'EP300', 'EPCAM', 'ERBB2', 'ERBB2IP', 'ERCC1', 'ERCC2',
    'ERCC3', 'ERCC5', 'EYA4', 'FGFR2', 'FGFR3', 'FGFR4', 'FAM123B', 'AMER1',
    'FAM175A', 'ABRAXAS1', 'FANCA', 'FANCD2', 'FANCI', 'FANCL', 'FANCM', 'SLX4',
    'FBXW7', 'FOXA1', 'FOXP1', 'FOXQ1', 'GATA3', 'GREM1', 'GALNT12', 'FGFR1',
    'HOXB13', 'PSGD', 'HDAC2', 'KDM6A', 'KEAP1', 'KRAS', 'KLF5', 'KMT2D', 'KMT2C',
    'KMT2B', 'MAP2K1', 'MAP2K2', 'MAP2K4', 'MAP3K1', 'MET', 'MLH1', 'MLH3',
    'MLLT4', 'MRE11A', 'MSH2', 'MSH3', 'MSH6', 'NBN', 'NCOR1', 'NCOR2', 'NF1',
    'NFE2L2', 'NTHL1', 'NOTCH1', 'NOTCH2', 'NRAS', 'PAIP1', 'MYH', 'PALB2',
    'PIK3CA', 'PMS2', 'POLD1', 'POLE', 'POLG', 'POLH', 'POLN', 'POLQ', 'PPPRR2A',
    'PRKDC', 'PSMC3IP', 'PTPRD', 'PTEN', 'CDKN2A', 'P14ARF', 'P16', 'RAD51',
    'RAD51B', 'RAD51C', 'RAD51D', 'RAD54L', 'RB1', 'RECQL', 'RBM10', 'RET',
    'REV3L', 'RFC4', 'RHOA', 'RHOB', 'RIF1', 'RINT1', 'RPS20', 'PPP2R2A', 'RAD50',
    'REPA1', 'RNF43', 'RUNX1', 'RXRA', 'SCG5', 'SF3B1', 'SMAD2', 'SMAD4',
    'SMARCA1', 'SMARCA4', 'SOX9', 'SPOP', 'STAG2', 'STK11', 'TCF7L2', 'TP53',
    'TSC1', 'TXNIP', 'TOP1', 'TOPO1', 'U2AF1', 'ZFP36L1', 'ZFP36L2',
)

In [None]:
# Keep only the rows whose Hugo_Symbol is one of the panel genes.
# The explicit .copy() makes df2 an independent frame: a later cell adds a
# column to df2, and assigning into a bare .loc slice of df raises pandas'
# SettingWithCopyWarning on versions without copy-on-write.
df2 = df.loc[df['Hugo_Symbol'].isin(genes)].copy()

# Report how many rows survived the filter, then preview them.
print(len(df2))
df2.head()

In [None]:
# Embed the YouTube video with this ID in the cell output.
YouTubeVideo(id="-AMpVbzsiYs")

### Find Variants
#### Substitution

**Pattern**: `c.2047C>T`

* `c.`
* A number
* A base (ACGT)
* A `>` symbol
* A base (ACGT)

```Python
r"c\.\d+[ACTG]>[ACTG]"
```

### Exercise

Write a regular expression to capture the nucleotide position, the wild-type base, and the variant base, as shown in the video.

### Find Variants
#### Deletions

**Pattern**

`ENST00000374690.3:c.213_239delGCAGCAGCAGCAGCAGCAGCAGCAGCA`
* `c.`
* number (deletion start)
* `_`
* number (deletion end)

### Exercise
#### Write a regular expression to capture the deletion start and the deletion end

In [None]:
# Embed the YouTube video with this ID in the cell output.
YouTubeVideo(id="nNUE_PrfrQ8")

### p. Parsing `HGVSp_Short`

* Pattern: `p.Gln72_Gln80del`

### Exercise
#### Write a regular expression to find the first number in the protein name (e.g. 72 above)


In [None]:
# Add a Protein_number column by applying find3 to every HGVSp_Short value.
# NOTE(review): find3 is not defined in the visible part of this notebook —
# presumably it is the function written for the exercise above; it must be
# defined before this cell is run.
df2["Protein_number"] = df2["HGVSp_Short"].apply(find3)
# Preview the first rows to check the new column.
df2.head()

### Exercise
Use the regular expressions you wrote above and Pandas operations to create a Pandas DataFrame whose `head()` looks like the following.

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Hugo_Symbol</th>
      <th>HGVSc</th>
      <th>HGVSp_Short</th>
      <th>dbSNP_RS</th>
      <th>Protein_number</th>
      <th>nucltide_pos</th>
      <th>wild_type_Base</th>
      <th>variant_Base</th>
      <th>Del_Start</th>
      <th>Del_End</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>AR</td>
      <td>ENST00000374690.3:c.2047C&gt;T</td>
      <td>p.Pro683Ser</td>
      <td></td>
      <td>683</td>
      <td>2047</td>
      <td>C</td>
      <td>T</td>
      <td>NaN</td>
      <td>NaN</td>
    </tr>
    <tr>
      <th>1</th>
      <td>AR</td>
      <td>ENST00000374690.3:c.213_239delGCAGCAGCAGCAGCAG...</td>
      <td>p.Gln72_Gln80del</td>
      <td></td>
      <td>72</td>
      <td>NaN</td>
      <td>NaN</td>
      <td>NaN</td>
      <td>213</td>
      <td>239</td>
    </tr>
    <tr>
      <th>2</th>
      <td>AR</td>
      <td>ENST00000374690.3:c.2246C&gt;A</td>
      <td>p.Ala749Asp</td>
      <td></td>
      <td>749</td>
      <td>2246</td>
      <td>C</td>
      <td>A</td>
      <td>NaN</td>
      <td>NaN</td>
    </tr>
    <tr>
      <th>3</th>
      <td>AR</td>
      <td>ENST00000374690.3:c.2256G&gt;A</td>
      <td>p.Trp752*</td>
      <td></td>
      <td>752</td>
      <td>2256</td>
      <td>G</td>
      <td>A</td>
      <td>NaN</td>
      <td>NaN</td>
    </tr>
    <tr>
      <th>4</th>
      <td>AR</td>
      <td>ENST00000374690.3:c.2296G&gt;A</td>
      <td>p.Ala766Thr</td>
      <td></td>
      <td>766</td>
      <td>2296</td>
      <td>G</td>
      <td>A</td>
      <td>NaN</td>
      <td>NaN</td>
    </tr>
  </tbody>
</table>