# Romain's Notebook - Section 1

## Libraries Loading

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
import openbabel
import pybel
from collections import Counter
import pyprind

## Data Loading

In [4]:
# Read the training and test sets (in that order) from the local Data/ folder.
df_train, df_test = map(pd.read_csv, ("Data/train", "Data/test"))

In [8]:
# Fixed seed so that the 10% sample and the hold-out split are identical on
# every "Restart & Run All"; without it each run works on different rows.
RANDOM_STATE = 42

# Work on a random 10% sample of the training data. Only the first element of
# the split is kept (indexing avoids binding `_`, which IPython uses for the
# last cell output).
df = train_test_split(df_train, train_size=0.1, random_state=RANDOM_STATE)[0]
print(df.shape)

# Separate the feature columns from the regression target ('gap').
X = df.drop(['gap'], axis=1)
y = df.gap.values

# Hold out 20% of the sample for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE)

(100000, 258)


## Feature Design

In [18]:
def MakeSomeFeatures(df):
    """Return a copy of ``df`` extended with per-molecule descriptor columns.

    Every value of the ``smiles`` column is parsed with ``pybel.readstring``.
    The molecule's ``molwt``, ``energy``, ``dim``, ``spin`` and ``charge``
    attributes are stored in the columns ``weight``, ``energy``, ``dim``,
    ``spin`` and ``charge``. One additional column per atom type holds the
    number of atoms of that type in the molecule (0 when the type does not
    occur in that molecule).

    The features are collected in plain Python records and attached to the
    frame once at the end, instead of writing single cells with ``df.loc``
    inside the loop, which is very slow on large frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``smiles`` column. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` in their original order with a fresh ``0..n-1``
        index and the feature columns added. A feature column replaces an
        existing column of the same name.
    """
    base_cols = ['weight', 'energy', 'dim', 'spin', 'charge']
    atom_cols = []            # atom types, in order of first appearance
    seen = set(base_cols)     # names that already have a feature column
    rows = []
    mybar = pyprind.ProgBar(df.shape[0])

    for s in df.smiles.values:
        mol = pybel.readstring("smi", s)
        row = {
            'weight': mol.molwt,
            'energy': mol.energy,
            'dim': mol.dim,
            'spin': mol.spin,
            'charge': mol.charge,
        }

        # Get the Atomic Structure: number of atoms of each type
        for atom_type, count in Counter(t.type for t in mol.atoms).items():
            row[atom_type] = count
            if atom_type not in seen:
                seen.add(atom_type)
                atom_cols.append(atom_type)

        rows.append(row)

        # Update the bar
        mybar.update()

    features = pd.DataFrame(rows, columns=base_cols + atom_cols)
    if atom_cols:
        # An atom type missing from a molecule is a count of zero, not a
        # missing value.
        features[atom_cols] = features[atom_cols].fillna(0)

    # reset_index returns a new frame, so the caller's frame is not modified.
    out = df.reset_index(drop=True)
    for col in features.columns:
        out[col] = features[col].values
    return out
    

In [None]:
# Replace X_train with the frame returned by MakeSomeFeatures, i.e. the same
# rows plus the pybel-derived feature columns (one molecule parse per row).
X_train = MakeSomeFeatures(X_train)

0%                          100%
[                              ]

## TEST ZONE ##

##### Using the Open Babel package (pybel)

In [None]:
# SMILES strings of the training rows labelled 0 and 1.
mols = df_train['smiles'].loc[0:1].values

# Parse the first two strings into pybel molecules `a` and `b`.
a, b = (pybel.readstring("smi", mols[pos]) for pos in (0, 1))

In [None]:
# Show the first parsed molecule as the cell output.
a

In [None]:
# Inspect the `charge` attribute of the second parsed molecule.
b.charge

In [None]:
# Print a short summary of molecule `a`.
#
# Each line is formatted first and handed to print as one parenthesised
# argument, so it prints "label value" under both Python 2 and Python 3.
# The previous `print('label'), value` form only displayed the value with the
# Python 2 print statement; under Python 3 the value was silently dropped.
print('%s %s' % ('Molecule : ', a))
print('%s %s' % ('Weight :', a.molwt))
print('%s %s' % ('FingerPrint:', a.calcfp().bits[0:10]))
print('%s %s' % ('Atomic Structure:', Counter([t.type for t in a.atoms])))
print('%s %s' % ('Energy', a.energy))
print('%s %s' % ('Dim', a.dim))
print('%s %s' % ('Spin', a.spin))

# `|` between two fingerprints gives their Tanimoto coefficient.
print('%s %s' % ('tanimoto A|B', a.calcfp() | b.calcfp()))

In [None]:
# Keep the `conformers` attribute of the first molecule for inspection.
test = a.conformers

In [None]:
# Inspect the `spin` attribute of the first parsed molecule.
a.spin

## RF Test (linear-regression baseline)

In [None]:
#store gap values
Y_train = df_train.gap.values
#row where testing examples start
test_idx = df_train.shape[0]
#delete 'Id' column
df_test = df_test.drop(['Id'], axis=1)
#delete 'gap' column
df_train = df_train.drop(['gap'], axis=1)

In [None]:
#DataFrame with all train and test examples so we can more easily apply feature engineering on
df_all = pd.concat((df_train, df_test), axis=0)
df_all.head()

#Drop the 'smiles' column
df_all = df_all.drop(['smiles'], axis=1)
vals = df_all.values
X_train = vals[:test_idx]
X_test = vals[test_idx:]
print "Train features:", X_train.shape
print "Train gap:", Y_train.shape
print "Test features:", X_test.shape

LR = LinearRegression()
LR.fit(X_train, Y_train)
LR_pred = LR.predict(X_test)

print('INSAMPLE RMSE'),
print mean_squared_error(LR.predict(X_train), Y_train)

## Various Links

http://stats.stackexchange.com/questions/56010/predicting-chemical-property-boiling-point-from-a-smiles-string

http://openbabel.org/docs/dev/Features/Fingerprints.html

http://openbabel.org/wiki/Tutorial:Fingerprints

https://pypi.python.org/pypi/openbabel

https://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html

Local Homebrew install location of Open Babel: `/usr/local/Cellar/open-babel/2.3.2`



## Command Lines

brew install open-babel

pip install openbabel

In [None]:
import openbabel

In [None]:
import pybel
# Minimal pybel fingerprint example on two hard-coded SMILES strings.
smiles = ['CCCC', 'CCCN']
mols = [pybel.readstring("smi", x) for x in smiles]  # Create a list of two molecules
fps = [x.calcfp() for x in mols]  # Calculate their fingerprints
# Pre-formatted, single-argument print calls give the same output under the
# Python 2 print statement and the Python 3 print function.
print('%s %s' % (fps[0].bits, fps[1].bits))
print(fps[0] | fps[1])  # Print the Tanimoto coefficient

class pybel.Molecule(OBMol)
Represent a Pybel Molecule.

Required parameter:
OBMol – an Open Babel OBMol or any type of Cinfony Molecule
Attributes:
atoms, charge, conformers, data, dim, energy, exactmass, formula, molwt, spin, sssr, title, unitcell.
Methods:
addh(), calcfp(), calcdesc(), draw(), localopt(), make3D(), removeh(), write()
The underlying Open Babel OBMol can be accessed using the attribute: