In [1]:
from random import seed
from random import randrange
from math import sqrt

In [2]:
pip install pyreadr

Note: you may need to restart the kernel to use updated packages.


In [7]:
import pyreadr
import numpy as np
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell only
# runs on the original author's machine — consider a configurable data directory.
data = pyreadr.read_r('C:/Users/drobi/Downloads/Fibro.Cellview.Rds')
# `df` is just a second name for the object returned by read_r. The displayed output
# shows an OrderedDict keyed by object name (e.g. 'log2cpm'), not a single table.
# NOTE(review): the helper functions defined later iterate over rows and index them by
# position — presumably one table must be selected and reshaped first; confirm.
df=data
df

OrderedDict([('log2cpm',
                               AACGGTACCTTCGC_1  AGAGAAACGCCCTT_1  AGGCAGGACGAATC_1  \
              rownames                                                                
              ENSG00000228463          0.000000          0.000000               0.0   
              ENSG00000230021          0.000000          0.000000               0.0   
              ENSG00000237491          0.000000          0.000000               0.0   
              ENSG00000177757          0.000000          0.000000               0.0   
              ENSG00000225880          0.000000          0.000000               0.0   
              ...                           ...               ...               ...   
              ENSG00000212907          0.000000          0.000000               0.0   
              ENSG00000198886          3.187171          4.279238               0.0   
              ENSG00000198786          0.000000          2.330691               0.0   
              ENSG

In [11]:
#Convert a String Column to Float
def str_column_to_float(df, column):
    """Convert one column of every row from text to ``float``, in place.

    Args:
        df: Iterable of mutable rows (e.g. a list of lists).
        column: Index (or key) of the field to convert in each row.

    Each value must support ``.strip()``; surrounding whitespace is removed
    before the conversion. Nothing is returned.
    """
    for record in df:
        raw_text = record[column]
        record[column] = float(raw_text.strip())
        

In [12]:
#Convert a String Column to Integer
def str_column_to_int(df, column):
    """Replace the values of one column with integer codes, in place.

    Args:
        df: Sequence of mutable rows (e.g. a list of lists). It is iterated
            twice, so it must not be a one-shot iterator.
        column: Index (or key) of the field to encode in each row.

    Returns:
        dict mapping each distinct original value to the integer code that
        replaced it.

    Codes are assigned in order of first appearance, so the mapping is the
    same on every run (iterating a ``set`` of strings is not, because string
    hashing is randomized per process).
    """
    # dict.fromkeys de-duplicates while keeping first-appearance order.
    distinct_values = dict.fromkeys(row[column] for row in df)
    lookup = {value: code for code, value in enumerate(distinct_values)}
    for row in df:
        row[column] = lookup[row[column]]
    return lookup
        
        

In [14]:
#Split a Dataset Into K Folds
def cross_validation_split(df, n_folds):
    """Randomly partition the rows of ``df`` into ``n_folds`` equal-sized folds.

    Args:
        df: Sequence of rows; it is copied, not modified.
        n_folds: Number of folds to produce.

    Returns:
        A list of ``n_folds`` lists, each holding ``int(len(df) / n_folds)``
        rows drawn without replacement. Rows left over when the size does
        not divide evenly are not placed in any fold.
    """
    remaining = list(df)
    rows_per_fold = int(len(df) / n_folds)
    folds = []
    for _ in range(n_folds):
        # Each draw removes the chosen row so it cannot land in two folds.
        folds.append(
            [remaining.pop(randrange(len(remaining))) for _ in range(rows_per_fold)]
        )
    return folds
    

In [15]:
#Calculate Accuracy Percentage
def accuracy_metric(actual, predicted):
    """Return the percentage of positions where ``predicted`` equals ``actual``.

    Args:
        actual: Sequence of true labels.
        predicted: Sequence of predicted labels, at least as long as
            ``actual`` (extra trailing predictions are ignored).

    Returns:
        A float in ``[0.0, 100.0]``. An empty ``actual`` yields ``0.0``
        rather than a division by zero.
    """
    total = len(actual)
    if total == 0:
        return 0.0
    correct = sum(1 for i in range(total) if actual[i] == predicted[i])
    return correct / float(total) * 100.0

In [17]:
#Evaluate An Algorithm Using a Cross Validation Split
def evaluate_algorithm(df, algorithm, n_folds, *args):
    """Score ``algorithm`` with k-fold cross validation.

    Args:
        df: Sequence of rows whose last element is the class label.
        algorithm: Callable invoked as ``algorithm(train, test, *args)`` that
            returns one prediction per test row.
        n_folds: Number of folds passed to ``cross_validation_split``.
        *args: Extra positional arguments forwarded to ``algorithm``.

    Returns:
        A list with one accuracy percentage per fold.
    """
    folds = cross_validation_split(df, n_folds)
    scores = []
    for held_out in folds:
        # Training data is every fold except the held-out one, flattened.
        other_folds = list(folds)
        other_folds.remove(held_out)
        train_set = [row for part in other_folds for row in part]

        # Test rows are copies with the label blanked out, so the algorithm
        # cannot read the answer and the fold itself is left untouched.
        test_set = []
        for row in held_out:
            masked = list(row)
            masked[-1] = None
            test_set.append(masked)

        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in held_out]
        scores.append(accuracy_metric(actual, predicted))
    return scores

In [20]:
#Split a Dataset Based on An Attribute and An Attribute Value
def test_split(index, value, df):
	left, right = list(), list()
	for row in df:
		if row[index] < value:
			left.append(row)
		else:
			right.append(row)
	return left, right

In [21]:
#Calculate the Gini Index for a Split Dataset
def gini_index(groups, classes):
    """Compute the size-weighted Gini impurity of a candidate split.

    Args:
        groups: Iterable of groups (lists of rows); each row's last element
            is its class label. It is iterated twice.
        classes: Iterable of the class labels to score.

    Returns:
        A float: ``0.0`` when every non-empty group contains a single class,
        larger values for more mixed groups. Empty groups contribute nothing.
    """
    # Total number of rows across all groups, used to weight each group.
    n_instances = float(sum(len(group) for group in groups))
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # An empty group has no impurity and would divide by zero below.
        if size == 0:
            continue
        # Extract the labels once per group instead of once per class.
        labels = [row[-1] for row in group]
        score = 0.0
        for class_val in classes:
            p = labels.count(class_val) / size
            score += p * p
        # Weight the group's impurity by its share of all rows.
        gini += (1.0 - score) * (size / n_instances)
    return gini

In [22]:
#Select the Best Split Point for a Dataset
def get_split(df, n_features):
    """Find the best split of ``df`` over a random subset of features.

    Args:
        df: Non-empty sequence of rows; the last element of each row is the
            class label and every other position is a candidate feature.
        n_features: Number of distinct feature indices to sample.

    Returns:
        dict with keys ``'index'`` (feature position), ``'value'``
        (threshold) and ``'groups'`` (the ``(left, right)`` row lists of the
        split with the lowest Gini index among those tried).

    Raises:
        ValueError: if ``n_features`` exceeds the number of feature columns.
            Sampling distinct indices could otherwise never finish.
    """
    class_values = list(set(row[-1] for row in df))
    n_candidates = len(df[0]) - 1
    if n_features > n_candidates:
        raise ValueError(
            "n_features (%d) exceeds the number of available features (%d)"
            % (n_features, n_candidates)
        )
    # 999 is a "nothing found yet" sentinel; any real Gini score is below it.
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    # Draw distinct feature indices; repeated draws are discarded.
    features = list()
    while len(features) < n_features:
        index = randrange(n_candidates)
        if index not in features:
            features.append(index)
    # Try every observed value of every sampled feature as a threshold.
    for index in features:
        for row in df:
            groups = test_split(index, row[index], df)
            gini = gini_index(groups, class_values)
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], gini, groups
    return {'index': b_index, 'value': b_value, 'groups': b_groups}
 

In [23]:
#Create a Terminal Node Value
def to_terminal(group):
    """Return the most frequent class label among the rows of ``group``.

    Args:
        group: Non-empty sequence of rows; the last element of each row is
            its class label.

    Returns:
        The label that occurs most often in ``group``.
    """
    labels = []
    for row in group:
        labels.append(row[-1])
    return max(set(labels), key=labels.count)
 

In [24]:
#Create Child Splits for a Node or Make Terminal
def split(node, max_depth, min_size, n_features, depth):
    """Recursively grow the subtree under ``node``, in place.

    Args:
        node: dict produced by ``get_split``; its ``'groups'`` entry is
            consumed and replaced by ``'left'`` and ``'right'`` children.
        max_depth: Depth at which both children are forced to be leaves.
        min_size: A side with this many rows or fewer becomes a leaf.
        n_features: Number of features sampled for each new split.
        depth: Depth of ``node`` itself.

    Each child ends up as either a class label (leaf) or another node dict.
    """
    left_rows, right_rows = node['groups']
    node.pop('groups')

    # One side is empty: nothing was separated, so both children share a leaf.
    if not (left_rows and right_rows):
        leaf = to_terminal(left_rows + right_rows)
        node['left'] = leaf
        node['right'] = leaf
        return

    # Depth limit reached: stop growing and turn both sides into leaves.
    if depth >= max_depth:
        left_leaf = to_terminal(left_rows)
        right_leaf = to_terminal(right_rows)
        node['left'] = left_leaf
        node['right'] = right_leaf
        return

    # Otherwise handle the left side fully, then the right side.
    for side, rows in (('left', left_rows), ('right', right_rows)):
        if len(rows) <= min_size:
            node[side] = to_terminal(rows)
        else:
            node[side] = get_split(rows, n_features)
            split(node[side], max_depth, min_size, n_features, depth + 1)

In [25]:
#Build a Decision Tree
def build_tree(train, max_depth, min_size, n_features):
    """Build one decision tree from the training rows.

    Args:
        train: Sequence of training rows (class label last).
        max_depth: Maximum depth of the tree.
        min_size: Row count at or below which a side becomes a leaf.
        n_features: Number of features sampled at each split.

    Returns:
        The root node dict, with its subtree fully grown.
    """
    tree = get_split(train, n_features)
    # The root sits at depth 1; split() grows the children in place.
    split(tree, max_depth, min_size, n_features, depth=1)
    return tree

In [26]:
#Make a Prediction With a Decision Tree
def predict(node, row):
    """Walk a decision tree and return the leaf value reached by ``row``.

    Args:
        node: Root node dict with ``'index'``, ``'value'``, ``'left'`` and
            ``'right'`` entries; a child is either another node dict or a
            leaf value.
        row: Sequence of feature values indexed by each node's ``'index'``.

    Returns:
        The first non-dict child encountered on the path from the root.
    """
    current = node
    while True:
        # Go left when the feature is strictly below the node's threshold.
        branch = 'left' if row[current['index']] < current['value'] else 'right'
        child = current[branch]
        if not isinstance(child, dict):
            return child
        current = child
 

In [27]:
#Create a Random Subsample From the Dataset with Replacement
def subsample(df, ratio):
    """Draw a random sample of rows from ``df`` with replacement.

    Args:
        df: Sequence of rows, indexable by integer position.
        ratio: Sample size as a fraction of ``len(df)``.

    Returns:
        A list of ``round(len(df) * ratio)`` rows; the same row may appear
        more than once.
    """
    n_sample = round(len(df) * ratio)
    return [df[randrange(len(df))] for _ in range(n_sample)]

In [28]:
#Make a Prediction with a List of Bagged Trees
def bagging_predict(trees, row):
    """Return the majority vote of several trees for one row.

    Args:
        trees: Non-empty iterable of root node dicts accepted by ``predict``.
        row: The row to classify.

    Returns:
        The prediction made by the largest number of trees.
    """
    votes = []
    for tree in trees:
        votes.append(predict(tree, row))
    return max(set(votes), key=votes.count)

In [29]:
#Random Forest Algorithm
def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
    """Train a random forest and predict a label for every test row.

    Args:
        train: Sequence of training rows (class label last).
        test: Iterable of rows to classify.
        max_depth: Maximum depth of each tree.
        min_size: Row count at or below which a side becomes a leaf.
        sample_size: Fraction of ``train`` drawn, with replacement, per tree.
        n_trees: Number of trees to build.
        n_features: Number of features sampled at each split.

    Returns:
        A list with one majority-vote prediction per row of ``test``.
    """
    # Every tree is trained on its own bootstrap sample of the training rows.
    forest = [
        build_tree(subsample(train, sample_size), max_depth, min_size, n_features)
        for _ in range(n_trees)
    ]
    return [bagging_predict(forest, row) for row in test]

In [30]:
#Test the Random Forest Algorithm
# Fix the state of Python's `random` module so the randrange-based draws used for
# fold assignment, feature selection and bootstrap sampling above are repeatable.
seed(2)

In [37]:
#Load and Prepare Data. We Will Do This Again Just to Check Off All the Boxes
# Rebinds `df` to the object already held in `data`; nothing is reloaded or
# converted here, so `df` is still the OrderedDict shown in the output.
df=data
df


OrderedDict([('log2cpm',
                               AACGGTACCTTCGC_1  AGAGAAACGCCCTT_1  AGGCAGGACGAATC_1  \
              rownames                                                                
              ENSG00000228463          0.000000          0.000000               0.0   
              ENSG00000230021          0.000000          0.000000               0.0   
              ENSG00000237491          0.000000          0.000000               0.0   
              ENSG00000177757          0.000000          0.000000               0.0   
              ENSG00000225880          0.000000          0.000000               0.0   
              ...                           ...               ...               ...   
              ENSG00000212907          0.000000          0.000000               0.0   
              ENSG00000198886          3.187171          4.279238               0.0   
              ENSG00000198786          0.000000          2.330691               0.0   
              ENSG

In [None]:
#Important Note: Take the Data and Turn It Into an Excel File or a Comma-Separated Values (CSV) File. We Will Have to Restructure the Data & Then Run This Tutorial Again.

#URL Link on How to Convert a String with Decimals to An Integer in Python: https://www.kite.com/python/answers/how-to-convert-a-string-with-decimals-to-an-integer-in-python

#URL Link on How to Fix "Key Error: 0" Python with a Dictionary: https://wholeblogs.com/fix-key-error-0-python-with-a-dictionary/

