In [100]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
try:
    # scikit-learn >= 0.18 exposes the cross-validation helpers here
    from sklearn.model_selection import cross_val_predict
except ImportError:
    # older scikit-learn releases only ship the legacy module
    from sklearn.cross_validation import cross_val_predict

In [101]:
# Read the job records from the CSV file into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer='gaussian-runtime-v1.csv')

In [102]:
# Filtering: keep only the rows whose machine, memory and nproc values are
# all non-zero.
df = df[(df.machine != 0) & (df.memory != 0) & (df.nproc != 0)]
# Original author's note: dim(df) = (4533,100) at this point (not re-verified).

# Remove the 'id' column from the frame.
# The axis is passed by keyword because newer pandas releases no longer accept
# it positionally, and errors='ignore' keeps the cell re-runnable once the
# column has already been removed.
df = df.drop('id', axis=1, errors='ignore')

In [103]:
# Overview figure: a 2x3 grid of panels describing runtime, number of atoms,
# memory, process count and the 'opt' column of `df`.

# shared plot parameters
fig = plt.figure(figsize=(18,6))
alpha_scatterplot = 0.2            # marker transparency for every scatter panel
alpha = alpha_scatterplot          # the original cell also bound this name; kept for compatibility
alpha_bar_chart = 0.55
grid_shape = (2,3)

# (0,0) kernel density estimate of the runtime
ax1 = plt.subplot2grid(grid_shape,(0,0))
df.runtime.plot(kind='kde', ax=ax1)
ax1.set_xlim(0, 2000000)
ax1.set_xlabel("running time")
ax1.set_title("Running time distribution")

# (0,1) kernel density estimate of the number of atoms
ax2 = plt.subplot2grid(grid_shape,(0,1))
df.natoms.plot(kind='kde', ax=ax2)
ax2.set_xlim(0, 150)
ax2.set_xlabel("number of atoms")
ax2.set_title("Number of atoms distribution")

# (0,2) runtime against number of atoms, with a binned mean overlaid
ax3 = plt.subplot2grid(grid_shape,(0,2))
ax3.scatter(df.natoms, df.runtime, alpha=alpha_scatterplot)
ax3.set_xlim(0,150)
ax3.set_ylim(0,2000000)
total_bins = 10
# `total_bins` equally spaced edges delimit total_bins-1 intervals
bins = np.linspace(df.natoms.min(),df.natoms.max(), total_bins)
delta = bins[1]-bins[0]
# np.digitize labels the half-open interval [bins[k-1], bins[k]) with k, so
# the usable labels are 1 .. total_bins-1. Label 0 never occurs (no value is
# below the minimum), and rows equal to the maximum get label total_bins;
# those are folded into the last interval so they are not lost.
idx  = np.digitize(df.natoms,bins)
idx[df.natoms.values == bins[-1]] = total_bins-1
running_mean = [np.mean(df.runtime[idx==k]) for k in range(1, total_bins)]
# each mean is drawn at the centre of its interval
ax3.plot(bins[1:]-delta/2,running_mean,'r--',lw=4,alpha=.8)
ax3.set_xlabel("number of atoms")
ax3.set_ylabel("runtime")
# horizontal grid lines only; the flag is positional because its keyword name
# differs between matplotlib versions
ax3.grid(True, which='major', axis='y')
ax3.set_title("Running time by number of atoms in the simulation")

# (1,0) kernel density estimate of the memory column
ax4 = plt.subplot2grid(grid_shape,(1,0))
df.memory.plot(kind='kde', ax=ax4)
ax4.set_xlim(0, 20000)
ax4.set_xlabel("memory")
ax4.set_title("Memory Distribution")

# (1,1) bar chart of how many rows use each nproc value
ax5 = plt.subplot2grid(grid_shape,(1,1))
nproc_counts = df.nproc.value_counts().sort_index()
nproc_counts.plot(kind='bar', alpha=alpha_bar_chart, ax=ax5)
ax5.set_xlim(-1, len(nproc_counts))
ax5.set_xlabel("number of process")
ax5.set_title("Number of Processes")

# (1,2) runtime on the x axis against the 'opt' column
ax6 = plt.subplot2grid(grid_shape,(1,2))
ax6.scatter(df.runtime, df.opt, alpha=alpha_scatterplot)
# sets the y axis label
ax6.set_ylabel("opt")
# horizontal grid lines only
ax6.grid(True, which='major', axis='y')
# hide the x ticks and their labels; booleans are used because the strings
# 'on'/'off' are not treated as flags by current matplotlib
ax6.tick_params(
    axis='x',           # changes apply to the x-axis
    which='both',       # both major and minor ticks are affected
    bottom=False,       # ticks along the bottom edge are off
    top=False,          # ticks along the top edge are off
    labelbottom=False)  # labels along the bottom edge are off
ax6.set_title("Runtime by 'opt'")

plt.tight_layout()
plt.show()

In [69]:
#----------------------------Job Types--------------------------------
# A 3x3 grid of scatter panels, one per job-type column, each plotting the
# runtime (x axis) against that column (y axis). Panels are filled row by row
# in the order listed below. Uses `alpha_scatterplot` from the overview cell.
job_type_columns = ['density', 'freq', 'guess',
                    'fopt', 'opt', 'sp',
                    'oniom', 'scrf', 'p']

fig = plt.figure(figsize=(18,6))
for panel_number, column_name in enumerate(job_type_columns):
    # divmod turns the running panel number into a (row, column) grid cell
    panel = plt.subplot2grid((3,3), divmod(panel_number, 3))
    # columns are looked up by key so names such as 'sp' or 'p' cannot be
    # confused with other attributes
    panel.scatter(df.runtime, df[column_name], alpha=alpha_scatterplot)
    # sets the y axis label
    panel.set_ylabel(column_name)
    # horizontal grid lines only; the flag is positional because its keyword
    # name differs between matplotlib versions
    panel.grid(True, which='major', axis='y')
    # hide the x ticks and their labels; booleans are used because the strings
    # 'on'/'off' are not treated as flags by current matplotlib
    panel.tick_params(
        axis='x',           # changes apply to the x-axis
        which='both',       # both major and minor ticks are affected
        bottom=False,       # ticks along the bottom edge are off
        top=False,          # ticks along the top edge are off
        labelbottom=False)  # labels along the bottom edge are off
    panel.set_title("Runtime by '%s'" % column_name)

plt.tight_layout()
plt.show()


In [99]:
# Cross-validated linear regression of runtime on the single feature `nmo`,
# followed by a measured-vs-predicted scatter plot.
lr = linear_model.LinearRegression()
# alternative feature set left by the original author:
#x = df[list(df.columns[0:64])]

# Column vectors of shape (n_rows, 1). The row count is inferred with -1
# rather than hard-coded, and the reshape is done on the underlying numpy
# array because pandas Series no longer provides .reshape.
x = df.nmo.values.reshape(-1, 1)
y = df.runtime.values.reshape(-1, 1)

# cross_val_predict returns an array of the same size as `y` where each entry
# is a prediction obtained by cross validation:
predicted = cross_val_predict(lr, x, y, cv=10)

fig, ax = plt.subplots()
ax.scatter(y, predicted)
# dashed diagonal marks where predicted == measured
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

In [97]:
# Show the ids of the jobs whose runtime exceeds the threshold below.

# Display full cell contents without truncation. Recent pandas expects None
# for "no limit" and rejects -1; older releases only accept an integer.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

runtime_threshold = 4000000
long_running = df.index[df.runtime > runtime_threshold]

# The filtering cell drops the 'id' column from `df`, so when it is missing
# the ids are re-read from the CSV and matched on the row labels.
# NOTE(review): this assumes `df` still carries the row labels assigned by
# read_csv (no reset_index in between) — confirm.
if 'id' in df.columns:
    job_ids = df['id']
else:
    job_ids = pd.read_csv('gaussian-runtime-v1.csv', usecols=['id'])['id']
job_ids.loc[long_running]

235     Fcac-freq.trestles.sdsc.teragrid.org.2534632.150304      
242     l209hf.trestles.sdsc.teragrid.org.2442742.150112         
589     mj12.trestles.sdsc.teragrid.org.2499371.150217           
1024    mt14.trestles.sdsc.teragrid.org.2501540.150218           
1242    mmj1.trestles.sdsc.teragrid.org.2576077.150413           
1405    mt1.trestles.sdsc.teragrid.org.2500254.150217            
1733    l207b32-final.trestles.sdsc.teragrid.org.2511262.150221  
1991    mt5.trestles.sdsc.teragrid.org.2500407.150217            
3337    mj40.trestles.sdsc.teragrid.org.2576037.150413           
3412    mj42.trestles.sdsc.teragrid.org.2576041.150413           
3978    TS6PhOCH3u-freq.trestles.sdsc.teragrid.org.2536531.150306
4280    l207hf1.trestles.sdsc.teragrid.org.2439869.150108        
4372    Ettser-b3.trestles.sdsc.teragrid.org.2439507.150107      
4643    mj17.trestles.sdsc.teragrid.org.2499380.150217           
Name: id, dtype: object