# 0. Задание

Был проведен анализ позиций ДНК у нескольких пациентов. Исследовалось, являются ли эти позиции в некотором смысле "отмеченными" (`1`, если да, и `0` иначе).
При исследовании проводится множество измерений материала и в результате для каждого пациента получают данные вида:

111 ~ 123  
010 ~ 12  
100 ~ 232  
011 ~ 94   

В данном примере 461 раз (123 + 12 + 232 + 94) проверялись 3 позиции ДНК, и в 123 случаях были помечены все три позиции, в 12 случаях только вторая позиция и т.д.

Вам даны аналогичные данные измерений для 20 позиций и 30 пациентов. Пациенты с "cncr" в их имени больны раком, пациенты с "cntrl" - нет.

Требуется определить, можно ли диагностировать рак по данным измерениям 20 генов. Для этого следует использовать следующие переменные:
1. Посчитать для пациентов долю измерений, в которых отмечена определенная позиция их ДНК. В примере для первой позиции это (123 + 232) / 461 = 0.77. Это даст 20 переменных.
2. Для каждой пары позиций рассмотреть четыре варианта их значений: (0, 0), (0, 1), (1, 0) и (1, 1). Посчитать частоту каждого из этих четырёх вариантов. Это даст 760 переменных (190 пар позиций × 4 варианта).


После преобразования данных должно получиться 780 разных переменных (20 одиночных + 760 парных).
Далее требуется провести логистическую регрессию для диагностирования рака по:
1. каждой из этих 780 переменных;
2. каждой паре из этих 780 переменных.

После чего требуется найти, какая комбинация переменных позволяет давать лучшие предсказания. Для отбора можно использовать любую "разумную" метрику, например, процент верных предсказаний.

Разбивать данные на тренировочную и тестовую группы не нужно.

# 1. Загрузка данных

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the measurement table. The "Unnamed: 0" column is forced to str so that
# the bit strings are kept as text (a numeric parse would drop leading zeros).
data = pd.read_csv("cancer_data.csv" , dtype = {"Unnamed: 0" : str})
data

Unnamed: 0.1,Unnamed: 0,p_cntrl5062,p_cntrl8512,p_cntrl5029,p_cncr7136,p_cntrl2642,p_cncr2935,p_cntrl1116,p_cncr5626,p_cntrl1317,...,p_cntrl8815,p_cncr4019,p_cncr4932,p_cncr8502,p_cncr9042,p_cncr8563,p_cntrl3243,p_cncr1633,p_cncr3976,p_cncr8561
0,01001101001101110101,648141,13551,2332,84717,826380,17426,327672,77719,26133073.0,...,5308,1885,4198,11761,9357,82,1458,3378,658874,5364
1,10100001101000101011,327972,450,596297,64928,18762,15337,14526,13223,93245.0,...,237,10063,28843,11965,12821,2209276,19195,35411,87369,34164
2,00001111010000010100,9367,8329,1779334,193804,1554,30004,4568,54491,92580.0,...,3048661,130081,9261,88916,1438,2863,2431,120,1159454,2619194
3,00100111010110100101,113918,40735,2439,277438,10408,7245,22275,511503,467215.0,...,206101,856,7181,20890,131290,213,13326,2216,204236,487775
4,00111110010000000010,57347,17715,37874,126782,502738,3569,3818879,63,5.0,...,2905373,2184800,7125721,435534,39126,23637,48453,1203595,72359,37433
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4985,10000101010110111001,324166,8387,83040,809439,1703,42392,537,18469,2465271.0,...,10050,60527,13012,52074,427147,627,812,27652331,1156,173864
4986,01101010111101000010,89765,1868,268496,40635,5551348,26053,37,601583,629981.0,...,5886,120,44955,503,877217,330449,9585,44202,1162,233553
4987,11110011010011111011,6159,26648,91958,2249922,6822,286972,4270,66366,1142327.0,...,821132,12707,647830,93420,1051,668,76223,233908,425300,4955
4988,00001111111111001111,147551,254475,263186,681604,13432,2059,18662,73592,161037.0,...,14278,13554,470785,52606,8744,49979,4007,2084410,12326,359


# 2. Обработка данных

## 2.1. Создание первых 20 переменных

In [12]:
def first_vars(number, patient, frame=None):
    """Return the share of a patient's measurements in which one position is marked.

    Args:
        number: Zero-based index of the position inside the bit string.
        patient: Label of the column holding that patient's measurement counts.
        frame: Table whose first column holds the bit strings and whose other
            columns hold per-patient counts. Defaults to the module-level
            ``data`` frame, so existing two-argument calls keep working.

    Returns:
        The sum of the counts in rows whose bit string has "1" at ``number``,
        divided by the sum of all counts in the ``patient`` column.
    """
    if frame is None:
        frame = data
    counts = frame.loc[:, patient].to_numpy()
    # Vectorised character test instead of a Python-level loop over every row.
    marked = (frame.iloc[:, 0].str[number] == "1").to_numpy()
    return counts[marked].sum() / counts.sum()

In [14]:
# One row per patient, one column per position (0..19); every cell is the
# value returned by first_vars for that position and patient.
patients = data.columns[1:]
df1 = pd.DataFrame(
    [[first_vars(position, patient) for position in range(20)] for patient in patients],
    index=patients,
)

In [16]:
# the first 20 variables
df1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
p_cntrl5062,0.462747,0.572106,0.516743,0.525061,0.347616,0.499543,0.617883,0.511539,0.482101,0.513342,0.572221,0.546231,0.525922,0.519118,0.539493,0.533262,0.430178,0.504275,0.417657,0.590758
p_cntrl8512,0.530833,0.455351,0.487968,0.482856,0.357667,0.464731,0.675723,0.480339,0.448679,0.473796,0.529463,0.469548,0.487793,0.52711,0.50019,0.496392,0.615607,0.435053,0.543994,0.489269
p_cntrl5029,0.520577,0.530358,0.493423,0.484561,0.358298,0.504457,0.642752,0.498532,0.466138,0.507472,0.460317,0.465169,0.47128,0.544826,0.579284,0.491264,0.420894,0.463746,0.568867,0.535452
p_cncr7136,0.502015,0.539793,0.48781,0.51602,0.515928,0.603226,0.504676,0.571242,0.500993,0.523456,0.393616,0.551598,0.479467,0.498843,0.492279,0.542611,0.54934,0.58295,0.512793,0.445364
p_cntrl2642,0.537916,0.557015,0.538617,0.496017,0.39718,0.443195,0.608925,0.469859,0.494188,0.513639,0.51379,0.485646,0.513222,0.538431,0.485311,0.541368,0.51274,0.50673,0.50005,0.423912
p_cncr2935,0.591003,0.53504,0.55015,0.402817,0.459503,0.508086,0.510628,0.549578,0.49767,0.464807,0.452171,0.551645,0.49314,0.546187,0.542177,0.495912,0.550495,0.466444,0.481604,0.369175
p_cntrl1116,0.473126,0.528132,0.398435,0.489751,0.400609,0.541216,0.561822,0.518668,0.520288,0.503406,0.475245,0.494774,0.432388,0.491124,0.49871,0.457408,0.394131,0.426518,0.43868,0.633253
p_cncr5626,0.491535,0.440708,0.445083,0.486188,0.37506,0.584717,0.656273,0.463297,0.402708,0.519046,0.465063,0.536971,0.534539,0.533373,0.555801,0.495935,0.645842,0.484593,0.515339,0.411078
p_cntrl1317,0.477747,0.475332,0.438778,0.489398,0.540296,0.512086,0.516041,0.485639,0.481767,0.49961,0.52715,0.525572,0.453427,0.511991,0.481163,0.553439,0.288529,0.487871,0.520267,0.708688
p_cncr3717,0.581856,0.436746,0.467895,0.503937,0.383839,0.478705,0.555742,0.567083,0.479335,0.419431,0.522687,0.575284,0.623312,0.545709,0.489454,0.449,0.693906,0.491471,0.415596,0.397917


## 2.2. Создание 760 переменных

##### Создание по 190 переменных по всевозможным парам позиций (number1, number2), значение для которых = 
##### (0, 0) если pair = 1
##### (0, 1) если pair = 2
##### (1, 0) если pair = 3
##### (1, 1) если pair = 4

In [25]:
def second_vars0(pair, number1, number2, patient, frame=None):
    """Return the share of a patient's measurements with a given two-position pattern.

    Args:
        pair: Pattern code: 1 -> (0, 0), 2 -> (0, 1), 3 -> (1, 0), 4 -> (1, 1).
        number1: Zero-based index of the first position in the bit string.
        number2: Zero-based index of the second position in the bit string.
        patient: Label of the column holding that patient's measurement counts.
        frame: Table whose first column holds the bit strings and whose other
            columns hold per-patient counts. Defaults to the module-level
            ``data`` frame, so existing four-argument calls keep working.

    Returns:
        The sum of the counts in rows matching the pattern, divided by the sum
        of all counts in the ``patient`` column.

    Raises:
        KeyError: If ``pair`` is not one of 1, 2, 3, 4.
    """
    symbols = {1: "00", 2: "01", 3: "10", 4: "11"}
    if frame is None:
        frame = data
    first_mark, second_mark = symbols[pair]
    codes = frame.iloc[:, 0]
    counts = frame.loc[:, patient].to_numpy()
    # Vectorised character tests instead of a Python-level loop over every row.
    matched = (
        (codes.str[number1] == first_mark) & (codes.str[number2] == second_mark)
    ).to_numpy()
    return counts[matched].sum() / counts.sum()


def second_vars(pair, frame=None, n_positions=20):
    """Build the pair-pattern shares for every patient and every position pair.

    Args:
        pair: Pattern code: 1 -> (0, 0), 2 -> (0, 1), 3 -> (1, 0), 4 -> (1, 1).
        frame: Table whose first column holds the bit strings and whose other
            columns hold per-patient counts. Defaults to the module-level
            ``data`` frame, so existing one-argument calls keep working.
        n_positions: Number of leading bit-string positions to pair up.
            The default of 20 yields 190 position pairs.

    Returns:
        A DataFrame with default integer labels on both axes: one row per
        patient column (in column order) and one column per position pair
        (n1, n2) with n1 < n2, in the same order as
        ``itertools.combinations(range(n_positions), 2)``. Each cell is the sum
        of the counts in rows matching the pattern divided by the patient's
        total count.

    Raises:
        KeyError: If ``pair`` is not one of 1, 2, 3, 4.
    """
    symbols = {1: "00", 2: "01", 3: "10", 4: "11"}
    if frame is None:
        frame = data
    first_mark, second_mark = symbols[pair]

    codes = frame.iloc[:, 0]
    counts = frame.iloc[:, 1:].to_numpy()
    totals = counts.sum(axis=0)

    # Each per-position mask is computed once and reused by every pair that
    # contains the position, instead of rescanning all rows for every cell.
    first_hits = [(codes.str[pos] == first_mark).to_numpy() for pos in range(n_positions)]
    second_hits = [(codes.str[pos] == second_mark).to_numpy() for pos in range(n_positions)]

    shares = []
    for n1 in range(n_positions - 1):
        for n2 in range(n1 + 1, n_positions):
            rows = first_hits[n1] & second_hits[n2]
            # Sums all patient columns at once for the selected rows.
            shares.append(counts[rows].sum(axis=0) / totals)

    # The explicit reshape keeps the (patients, pairs) layout even with no pairs.
    table = np.asarray(shares, dtype=float).reshape(len(shares), counts.shape[1])
    return pd.DataFrame(table.T)

In [26]:
from itertools import combinations


# Column-name suffix for each pattern; key i corresponds to pair code i + 1.
labels = {0: "[0/0]", 1: "[0/1]", 2: "[1/0]", 3: "[1/1]"}
# Same ordering as the nested n1 < n2 loops inside second_vars.
position_pairs = list(combinations(range(20), 2))

d_pairs = []
for i, label in labels.items():
    d = second_vars(i + 1)
    d.columns = ["{0} ~ {1}".format(positions, label) for positions in position_pairs]
    d_pairs.append(d)

In [27]:
# Place the four per-pattern tables side by side, then label rows by patient.
df2 = pd.concat(d_pairs, axis="columns")
df2.index = data.columns[1:]

In [29]:
# The next 760 variables
df2

Unnamed: 0,"(0, 1) ~ [0/0]","(0, 2) ~ [0/0]","(0, 3) ~ [0/0]","(0, 4) ~ [0/0]","(0, 5) ~ [0/0]","(0, 6) ~ [0/0]","(0, 7) ~ [0/0]","(0, 8) ~ [0/0]","(0, 9) ~ [0/0]","(0, 10) ~ [0/0]",...,"(15, 16) ~ [1/1]","(15, 17) ~ [1/1]","(15, 18) ~ [1/1]","(15, 19) ~ [1/1]","(16, 17) ~ [1/1]","(16, 18) ~ [1/1]","(16, 19) ~ [1/1]","(17, 18) ~ [1/1]","(17, 19) ~ [1/1]","(18, 19) ~ [1/1]"
p_cntrl5062,0.228438,0.231989,0.226259,0.366479,0.265256,0.218489,0.234093,0.25287,0.292635,0.205559,...,0.225013,0.291093,0.231048,0.33093,0.212081,0.204747,0.22888,0.216742,0.308532,0.23609
p_cntrl8512,0.252525,0.264362,0.245559,0.287193,0.269076,0.16116,0.217076,0.253383,0.241498,0.237213,...,0.316035,0.206316,0.308911,0.227748,0.254481,0.326802,0.348539,0.223796,0.20912,0.264799
p_cntrl5029,0.232457,0.25546,0.254538,0.319204,0.215065,0.152377,0.239208,0.232891,0.224168,0.240966,...,0.224676,0.212432,0.301079,0.281691,0.182827,0.238422,0.199041,0.285918,0.266724,0.300634
p_cncr7136,0.205747,0.240593,0.262527,0.240633,0.213957,0.250882,0.20235,0.258337,0.227172,0.309148,...,0.31739,0.349509,0.274053,0.255908,0.320352,0.274145,0.319911,0.262657,0.279276,0.221042
p_cntrl2642,0.216222,0.208439,0.231694,0.2762,0.261166,0.191496,0.231401,0.246059,0.213783,0.213339,...,0.301538,0.258068,0.273261,0.225118,0.251707,0.255343,0.243907,0.269541,0.228683,0.240521
p_cncr2935,0.211038,0.186486,0.234985,0.19943,0.173745,0.238572,0.191918,0.210641,0.208988,0.215379,...,0.251954,0.286917,0.223289,0.191118,0.243974,0.263608,0.244612,0.190427,0.161866,0.193296
p_cntrl1116,0.203562,0.307519,0.236835,0.320584,0.18725,0.211891,0.251205,0.266239,0.271244,0.294319,...,0.205503,0.226451,0.243455,0.298854,0.175856,0.208449,0.223823,0.200383,0.278689,0.303304
p_cncr5626,0.273474,0.274174,0.259044,0.286209,0.205534,0.190109,0.293304,0.310941,0.280876,0.257449,...,0.336754,0.20469,0.249743,0.199015,0.304778,0.343641,0.355709,0.260131,0.202317,0.254002
p_cntrl1317,0.278173,0.305645,0.236412,0.208354,0.244921,0.261373,0.289669,0.255632,0.270902,0.250645,...,0.143136,0.253179,0.295198,0.403828,0.141727,0.152968,0.136099,0.263378,0.329206,0.381268
p_cncr3717,0.231436,0.23202,0.254268,0.259477,0.248908,0.207991,0.147788,0.241024,0.245638,0.17197,...,0.286158,0.234704,0.184928,0.168373,0.346001,0.28415,0.38514,0.182795,0.167592,0.175474


## 2.3. Создание переменной target

In [30]:
def disease(x):
    """Return 1 when a patient label marks a cancer case, otherwise 0.

    A label counts as a cancer case when it contains the substring "cncr"
    anywhere, so the result does not depend on the length of any prefix
    (labels such as "p_cncr7136" and "cncr7136" are both recognised).

    Args:
        x: Patient label (string).

    Returns:
        1 if "cncr" occurs in ``x``, else 0.
    """
    return 1 if "cncr" in x else 0

In [31]:
# Target variable: cncr is the value disease() returns for each patient label
# (1 for a cancer patient, 0 otherwise).
patient_labels = data.columns[1:]
df3 = pd.DataFrame(
    {"cncr": [disease(label) for label in patient_labels]},
    index=patient_labels,
)
df3

Unnamed: 0,cncr
p_cntrl5062,0
p_cntrl8512,0
p_cntrl5029,0
p_cncr7136,1
p_cntrl2642,0
p_cncr2935,1
p_cntrl1116,0
p_cncr5626,1
p_cntrl1317,0
p_cncr3717,1


## 2.4. Создание DataFrame для 30 пациентов с 781 переменными

In [32]:
# Join the single-position table, the pair table and the target column side by
# side; rows are aligned on the shared patient index.
df = pd.concat((df1, df2, df3), axis="columns")
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,"(15, 17) ~ [1/1]","(15, 18) ~ [1/1]","(15, 19) ~ [1/1]","(16, 17) ~ [1/1]","(16, 18) ~ [1/1]","(16, 19) ~ [1/1]","(17, 18) ~ [1/1]","(17, 19) ~ [1/1]","(18, 19) ~ [1/1]",cncr
p_cntrl5062,0.462747,0.572106,0.516743,0.525061,0.347616,0.499543,0.617883,0.511539,0.482101,0.513342,...,0.291093,0.231048,0.33093,0.212081,0.204747,0.22888,0.216742,0.308532,0.23609,0
p_cntrl8512,0.530833,0.455351,0.487968,0.482856,0.357667,0.464731,0.675723,0.480339,0.448679,0.473796,...,0.206316,0.308911,0.227748,0.254481,0.326802,0.348539,0.223796,0.20912,0.264799,0
p_cntrl5029,0.520577,0.530358,0.493423,0.484561,0.358298,0.504457,0.642752,0.498532,0.466138,0.507472,...,0.212432,0.301079,0.281691,0.182827,0.238422,0.199041,0.285918,0.266724,0.300634,0
p_cncr7136,0.502015,0.539793,0.48781,0.51602,0.515928,0.603226,0.504676,0.571242,0.500993,0.523456,...,0.349509,0.274053,0.255908,0.320352,0.274145,0.319911,0.262657,0.279276,0.221042,1
p_cntrl2642,0.537916,0.557015,0.538617,0.496017,0.39718,0.443195,0.608925,0.469859,0.494188,0.513639,...,0.258068,0.273261,0.225118,0.251707,0.255343,0.243907,0.269541,0.228683,0.240521,0
p_cncr2935,0.591003,0.53504,0.55015,0.402817,0.459503,0.508086,0.510628,0.549578,0.49767,0.464807,...,0.286917,0.223289,0.191118,0.243974,0.263608,0.244612,0.190427,0.161866,0.193296,1
p_cntrl1116,0.473126,0.528132,0.398435,0.489751,0.400609,0.541216,0.561822,0.518668,0.520288,0.503406,...,0.226451,0.243455,0.298854,0.175856,0.208449,0.223823,0.200383,0.278689,0.303304,0
p_cncr5626,0.491535,0.440708,0.445083,0.486188,0.37506,0.584717,0.656273,0.463297,0.402708,0.519046,...,0.20469,0.249743,0.199015,0.304778,0.343641,0.355709,0.260131,0.202317,0.254002,1
p_cntrl1317,0.477747,0.475332,0.438778,0.489398,0.540296,0.512086,0.516041,0.485639,0.481767,0.49961,...,0.253179,0.295198,0.403828,0.141727,0.152968,0.136099,0.263378,0.329206,0.381268,0
p_cncr3717,0.581856,0.436746,0.467895,0.503937,0.383839,0.478705,0.555742,0.567083,0.479335,0.419431,...,0.234704,0.184928,0.168373,0.346001,0.28415,0.38514,0.182795,0.167592,0.175474,1


In [33]:
# Ascending sort on the target: rows with cncr = 0 come before rows with cncr = 1.
sorted_df = df.sort_values(by = "cncr")
sorted_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,"(15, 17) ~ [1/1]","(15, 18) ~ [1/1]","(15, 19) ~ [1/1]","(16, 17) ~ [1/1]","(16, 18) ~ [1/1]","(16, 19) ~ [1/1]","(17, 18) ~ [1/1]","(17, 19) ~ [1/1]","(18, 19) ~ [1/1]",cncr
p_cntrl5062,0.462747,0.572106,0.516743,0.525061,0.347616,0.499543,0.617883,0.511539,0.482101,0.513342,...,0.291093,0.231048,0.33093,0.212081,0.204747,0.22888,0.216742,0.308532,0.23609,0
p_cntrl9875,0.447097,0.546533,0.440201,0.46089,0.275959,0.461445,0.687262,0.587683,0.500234,0.560227,...,0.234235,0.31343,0.188039,0.271404,0.386887,0.276132,0.176415,0.162028,0.163304,0
p_cntrl8815,0.46227,0.541178,0.459234,0.611014,0.529849,0.518526,0.729697,0.467104,0.437014,0.556843,...,0.224926,0.226475,0.223676,0.303116,0.252869,0.269166,0.247388,0.291001,0.27832,0
p_cntrl2476,0.403381,0.57213,0.49215,0.513787,0.495291,0.537423,0.75951,0.359524,0.541245,0.460159,...,0.284663,0.271506,0.278801,0.325397,0.263956,0.314311,0.247756,0.374185,0.26918,0
p_cntrl1116,0.473126,0.528132,0.398435,0.489751,0.400609,0.541216,0.561822,0.518668,0.520288,0.503406,...,0.226451,0.243455,0.298854,0.175856,0.208449,0.223823,0.200383,0.278689,0.303304,0
p_cntrl1317,0.477747,0.475332,0.438778,0.489398,0.540296,0.512086,0.516041,0.485639,0.481767,0.49961,...,0.253179,0.295198,0.403828,0.141727,0.152968,0.136099,0.263378,0.329206,0.381268,0
p_cntrl2642,0.537916,0.557015,0.538617,0.496017,0.39718,0.443195,0.608925,0.469859,0.494188,0.513639,...,0.258068,0.273261,0.225118,0.251707,0.255343,0.243907,0.269541,0.228683,0.240521,0
p_cntrl5029,0.520577,0.530358,0.493423,0.484561,0.358298,0.504457,0.642752,0.498532,0.466138,0.507472,...,0.212432,0.301079,0.281691,0.182827,0.238422,0.199041,0.285918,0.266724,0.300634,0
p_cntrl8512,0.530833,0.455351,0.487968,0.482856,0.357667,0.464731,0.675723,0.480339,0.448679,0.473796,...,0.206316,0.308911,0.227748,0.254481,0.326802,0.348539,0.223796,0.20912,0.264799,0
p_cntrl3243,0.465006,0.496341,0.535129,0.492914,0.340986,0.551096,0.651319,0.492539,0.501878,0.474576,...,0.208338,0.210329,0.234933,0.234055,0.204415,0.229164,0.238898,0.268096,0.251949,0


# 3. Logistic regression analysis

In [34]:
from sklearn import linear_model

## 3.1 Предсказание по одной переменной

In [35]:
from multiprocessing import Pool

In [36]:
#Содержимое regr.py

#from sklearn import linear_model
#def one_var_regr(df_var):   
#    log_regr1 = linear_model.LogisticRegression(penalty = 'none')
#    log_regr1.fit(df_var.iloc[:, :1], df_var["cncr"])
#    return log_regr1.score(df_var.iloc[:, :1], df_var["cncr"])
#...

In [43]:
from multiprocessing import Pool
import regr

if __name__ ==  '__main__':
    # One small frame per feature: the feature column followed by the "cncr" target.
    tasks = [df[[i]].join(df3) for i in df.columns[:-1]]
    p = Pool()
    output = p.map_async(regr.one_var_regr, tasks)
    # No further tasks are submitted to this pool; closing it lets the worker
    # processes exit once the queued fits are done. output.get() still works.
    p.close()

In [44]:
# Gather the per-feature scores (get() waits for the async map to finish),
# label them with the feature names and order them from best to worst.
single_scores = output.get()
feature_names = df.columns[df.columns!='cncr']
df_results_one = pd.DataFrame({"score": single_scores}, index=feature_names)
df_results_one = df_results_one.sort_values(by="score", ascending=False)
df_results_one

Unnamed: 0,score
"(6, 19) ~ [1/1]",0.900000
"(14, 19) ~ [1/1]",0.900000
"(1, 16) ~ [1/0]",0.866667
"(9, 19) ~ [1/0]",0.866667
"(11, 19) ~ [0/1]",0.866667
...,...
"(0, 9) ~ [1/1]",0.600000
"(1, 15) ~ [0/0]",0.600000
"(5, 8) ~ [1/0]",0.600000
"(0, 8) ~ [1/1]",0.566667


In [45]:
df_results_one.iloc[0] # genes 6 and 19 with marks (1, 1) (genes are numbered from zero)

score    0.9
Name: (6, 19) ~ [1/1], dtype: float64

In [46]:
df_results_one.iloc[1] # genes 14 and 19 with marks (1, 1) (genes are numbered from zero)

score    0.9
Name: (14, 19) ~ [1/1], dtype: float64

## 3.2 Предсказание по паре переменных

In [47]:
# All unordered pairs (a, b), a < b, of the integers 0..779; they are used
# below as positional column indices into df.
comb = list(combinations(range(780), 2))

In [48]:
# Содержимое regr.py (продолжение)

#def pair_var_regr(df_var):   
#    log_regr2 = linear_model.LogisticRegression(penalty = 'none')
#    log_regr2.fit(df_var.iloc[:, :2], df_var["cncr"])
#    return log_regr2.score(df_var.iloc[:, :2], df_var["cncr"])

In [49]:
if __name__ ==  '__main__':
    # One small frame per feature pair: the two feature columns (selected by
    # position) followed by the "cncr" target.
    tasks = [df.iloc[:, [first, second]].join(df3) for first, second in comb]
    p = Pool()
    output_pair = p.map_async(regr.pair_var_regr, tasks)
    # No further tasks are submitted to this pool; closing it lets the worker
    # processes exit once the queued fits are done. output_pair.get() still works.
    p.close()

In [50]:
# Collect the pair scores into a one-column frame; get() waits until the
# asynchronous map has finished.
df_results_pair = pd.DataFrame(output_pair.get())

In [51]:
# Name the score column and label every row with the two feature names of its pair.
df_results_pair.columns = ["score"]
df_results_pair.index = [
    "{} ;; {}".format(df.columns[first], df.columns[second])
    for first, second in comb
]
df_results_pair

Unnamed: 0,score
0 ;; 1,0.733333
0 ;; 2,0.666667
0 ;; 3,0.666667
0 ;; 4,0.666667
0 ;; 5,0.633333
...,...
"(16, 19) ~ [1/1] ;; (17, 19) ~ [1/1]",0.866667
"(16, 19) ~ [1/1] ;; (18, 19) ~ [1/1]",0.800000
"(17, 18) ~ [1/1] ;; (17, 19) ~ [1/1]",0.800000
"(17, 18) ~ [1/1] ;; (18, 19) ~ [1/1]",0.733333


In [52]:
# Best scores first. mergesort is a stable algorithm, so pairs with equal
# scores keep their previous relative order.
df_results_pair = df_results_pair.sort_values(by = "score", ascending = False, kind = "mergesort")
df_results_pair

Unnamed: 0,score
"(1, 6) ~ [1/1] ;; (8, 19) ~ [1/1]",1.000000
"6 ;; (16, 19) ~ [0/0]",0.966667
"(3, 4) ~ [0/0] ;; (11, 19) ~ [0/1]",0.966667
"(4, 5) ~ [0/0] ;; (10, 19) ~ [0/0]",0.966667
"(6, 17) ~ [0/0] ;; (16, 19) ~ [0/0]",0.966667
...,...
"(2, 6) ~ [1/1] ;; (12, 18) ~ [1/1]",0.533333
"(5, 18) ~ [1/1] ;; (6, 17) ~ [1/1]",0.533333
"(7, 15) ~ [1/1] ;; (10, 13) ~ [1/1]",0.533333
"(5, 14) ~ [0/0] ;; (14, 18) ~ [0/1]",0.500000


In [53]:
df_results_pair.iloc[0] 
# genes 8 and 19 with marks (1, 1) (genes are numbered from zero)
# genes 1 and 6 with marks (1, 1) (genes are numbered from zero)

score    1.0
Name: (1, 6) ~ [1/1] ;; (8, 19) ~ [1/1], dtype: float64