**Sample Code Blocks**
The following code blocks are samples that illustrate the structure of common data analysis tasks.

Please use them as a reference, as needed, when working within a Python document.

**Library Import**

In [2]:
# Common data analysis libraries to import at the start of any new Python document
import pandas as pd  # Helps with organizing and formatting data.  "pd" is the shorthand name we use to refer to this package
import numpy as np   # "Num Py" is a package that contains common mathematical and statistical functions
import matplotlib as mpl # "Mat Plot Lib" is a package that helps with plotting and graphing data
import matplotlib.pyplot as plt  #  "Py Plot" is a sub-package of Mat Plot Lib that particularly helps with plotting and graphing data
from scipy.optimize import curve_fit # curve_fit is a function used to fit a model (such as a straight line) to our data
print("Packages Imported!")  # Confirmation message so you can see that this cell ran

Packages Imported!


**Data Entering and Importing**

In [None]:
# Raw data entry into a Dataframe
# Replace each blank (__________) with the rest of your data values.
# Every list must end up with the same number of values, one per row.
data = pd.DataFrame(
    { "radius": [0.5, 1.00, 2.00, __________ ],
     "force": [60.0, 31.0, 15.2, ___________ ]
     })

# To check to see if the data was entered correctly, you can use the "head" method to show the first few rows
data.head()
# Use "shift and enter" to run the code and see the output

In [None]:
# Importing data into a Dataframe from a web based source
# Replace 'web link here' with the web address (URL) of a CSV file, keeping the quotes
dataImported = pd.read_csv('web link here')
dataImported.head()  # Show the first few rows to confirm the import worked

In [None]:
# Import data from a csv file uploaded to the session
# Include this import line at top of the notebook
from google.colab import files

# Code to import the file.  Be sure to have "drag and dropped" your data file into the "Files" section on the left (folder icon)
# Replace ***FileNameHere*** with the name of the uploaded file.
# If the values are separated by spaces instead of commas, try sep=r'\s+' instead of sep=','
# (older pandas versions also accept delim_whitespace=True, which newer versions deprecate)
data = pd.read_csv('/content/***FileNameHere***.csv', index_col = None, sep=',', header=0)

data.head(5)  # Show the first 5 rows to confirm the import worked

In [None]:
# This block of code will import your own data from your google drive and it will print out part of your data so you can see that it worked
# Open your CSV file in google, google will turn it into a spreadsheet... or open a new google sheets file and import your CSV file.
# Click on the share button and make sure that anyone with the link can edit. Copy the share link
# NOTE(review): view-only access is presumably enough for the export link to work -- confirm before granting edit rights
# Paste the share link into the quotes below (red)
# Change the END of your share link from /edit?usp=sharing to /export

data = pd.read_excel('https://docs.google.com/spreadsheets/d/1mmK8usfTTMVdkf64XTvgAi7V3jO_rltuOJ9WnSLwrEw/export')
# The .head(n) command displays the first n rows of a file. You can change 5 to display a different number of lines.
data.head(5)

In [None]:
# This code block will load a file from a Google Drive folder that is NOT a Google Spreadsheet (i.e. a text file)
# Start by noting the link below, enclosed in single quotes and already entered for you in the code...
# https://drive.google.com/uc?export=download&id=FILEID

# Replace "FILEID" with the Google Drive File ID.  You get this from the shared link.  If the shared link was...
# https://drive.google.com/file/d/1xm4CjIfgGb9pDOeBsSTuNR_tjimSNcck/view?usp=sharing
# The File ID for this link is...
#   1xm4CjIfgGb9pDOeBsSTuNR_tjimSNcck

# Remember to also update "sep" to indicate the separator being used.  Here a single blank space is the separator


data = pd.read_csv('https://drive.google.com/uc?export=download&id=FILEID', index_col = None, sep=' ', header=0)

data.head(5)

In [None]:
# Create an empty data frame (no rows and no columns yet)
emptyData = pd.DataFrame()

In [None]:
# Create an array of evenly spaced values
# np.linspace(start, stop, num): "num" is the NUMBER of values to create (not the spacing between them),
# and both the start and stop values are included in the result
newBins = np.linspace(start=0, stop=6014103, num=60000)

**Data Manipulation and Calculation within a DataFrame**

In [None]:
# Create a new column that is a copy of an existing column
# (useful for giving a column a simpler name, for example one without spaces)
data["NewColumnName"] = data["OldColumnNameWithSpaces"]

In [None]:
# Manipulating Data in a Dataframe
# Works when calculated values do not involve the column being written to
# NOTE(review): "data_3" is not defined in the sample cells visible here -- replace it with the name of your own data frame
data_3["inverseR"] = 1 / data_3["radius"]  #can do basic math and it will apply to all values in a particular column


In [None]:
# Reference a value in the previous row within a data frame
# shift() returns a copy of the data frame with every row moved down by one, so row i holds the values
# that were in row i-1 of "data".  The first row has no previous row, so it is filled with missing values (NaN)
previousDataRow = data.shift()

In [None]:
# Manipulating Data when the result involves the current column being written to
# (each new 'z' value depends on the 'z' value already calculated for the previous row)
# Assumes data_4 uses the default 0, 1, 2, ... row labels and that "constant" was defined earlier

data_4['z'] = 0.0 #Create the result column first, filled with 0.0 values to assure double datatype

# Start at row 1: row 0 has no previous row, so it keeps its starting value of 0.0
for i in range(1,len(data_4.index)):
  # Read 'x' and the previous 'z' from the SAME data frame that is being written to,
  # otherwise the running total never sees the values calculated in earlier passes of the loop
  data_4.at[i,'z'] = data_4.at[i,'x']*constant + data_4.at[i-1, 'z'] #use a for loop to cycle through, and the "at" method to index appropriately
data_4.head()

In [None]:
# Keep only the rows whose value in the chosen column lies between 10 and 12 (both limits included)
inRange = data["columnNameToFocus"].between(10, 12)  # True or False for every row
dataOfInterest = data[inRange]  # New data frame holding just the rows of interest
dataOfInterest.head()

In [None]:
# To remove outliers (for a data frame called "data"), state the condition for the rows to KEEP.
# Every row that does not meet the condition is dropped.

fluxAboveCutoff = data['flux'] > 6800  # True for rows with a flux greater than the number provided
data = data[fluxAboveCutoff]  # keeps only those rows, which drops the outliers

**Description of Data**

In [None]:
# Overview of the data, statistically
# describe() returns a summary table (count, mean, std, min, quartiles, max) for the numeric columns
data_5.describe()

In [None]:
# Isolating a range of data (for example, before calculating statistics on just that range)
# Keeps only the rows where NewColumnName is greater than 8 and less than 14 (the limits themselves are excluded)
# Note: the result is stored back into "data", so the rows outside the range are no longer in "data" afterwards

data = data.query("NewColumnName > 8 & NewColumnName < 14") # Allows isolation of some data based on a range


In [None]:
# Average and standard deviation of one column of data
# (pandas .std() gives the sample standard deviation, i.e. it divides by N-1)
columnOfInterest = data["ColumnName"]
average = columnOfInterest.mean()
standardDeviation = columnOfInterest.std()

In [None]:
# For each row, calculate the average and standard deviation across several named columns
# and put each result in a new column (axis=1 means "work across the columns of each row")
columnsToCombine = ['firstColumnName','secondColumnName','thirdColumnName']
selectedColumns = data_5[columnsToCombine]
data_5['rowAverage'] = selectedColumns.mean(axis=1)
data_5['rowStdDeviation'] = selectedColumns.std(axis=1)

In [None]:
# Count the number of rows (instances) that satisfy a boolean condition
# Replace both ***...*** placeholders before running (the cell cannot run until they are replaced)
# The comparison gives True or False for every row; summing counts the True values
numberOfRows = np.sum(data['***Column_Of_Interest***'] > ***ComparisonConditionValueHere***)

**Visualizing Data**

In [None]:
# Scatter plot of one column of data against another
xData = sampleData["sampleX"]
yData = sampleData["sampleY"]

# Lower limits for the two axes (the upper limits are left for matplotlib to choose)
xMin, yMin = 0, 0
xMax = xData.max()  # Largest x value; calculated here but not passed to plt.axis below

plt.scatter(xData, yData)
plt.axis(xmin = xMin, ymin = yMin)

# Title and axis labels
plt.title("Sample Plot")
plt.xlabel("Sample X")
plt.ylabel("Sample Y")
plt.show()

In [None]:
# Create a scatter plot graph with a best fit line

# Set minimum values for the plot axis; xMax records the largest x value in the data
xMin = 0
yMin = 0
xMax = np.max(sampleData["sampleX"])

# Create the scatter plot of the data
plt.scatter(x = sampleData["sampleX"], y = sampleData["sampleY"], label = "Sample Data")
plt.axis(xmin = xMin, ymin = yMin)

# Code to create best fit line is here
def linearFunc(x,intercept,slope):
  """Straight-line model, y = intercept + slope*x, that curve_fit adjusts to match the data."""
  y = intercept + slope*x
  return y

# curve_fit returns the best-fit values in the same order as the arguments of linearFunc after x
# (intercept first, then slope), together with their covariance matrix
a_fit, cov = curve_fit(linearFunc,sampleData['sampleX'],sampleData['sampleY'])
slope = a_fit[1]
intercept = a_fit[0]

# Create the best fit line from the fit calculation and plot it
yfit = intercept + slope*sampleData['sampleX']
plt.plot(sampleData['sampleX'], yfit,color = "r", label="Fit Line")

# Plot Titles and Legend
plt.title("Sample Plot")
plt.xlabel("Sample X")
plt.ylabel("Sample Y")
plt.legend()

# Show the Graph, Print the Best Fit Line Equation
plt.show()
print("y=%.6fx+%.6f"%(slope, intercept)) #Code to print the resulting best fit equation.  Should show below the graph

In [None]:
# Scatter plot with error bars on the y values

xData = sampleData["sampleX"]
yData = sampleData["sampleY"]
yUncertainty = sampleData["y_error"]  # Size of the error bar for each point

# Lower limits for the two axes (the upper limits are left for matplotlib to choose)
xMin, yMin = 0, 0
xMax = xData.max()  # Largest x value; calculated here but not passed to plt.axis below

# fmt='bo' draws the points as blue circles; capsize sets the length of the caps on the error bars
plt.errorbar(xData, yData, yerr=yUncertainty, fmt='bo', capsize=10)
plt.axis(xmin = xMin, ymin = yMin)

# Title and axis labels
plt.title("Sample Plot")
plt.xlabel("Sample X")
plt.ylabel("Sample Y")
plt.show()

In [None]:
# Create a Scatter Plot with Error Bars, Best Fit Line, and Calculated Slope and Intercept Uncertainty
# Uncertainty is based on the Y-Axis uncertainty only

# Set minimum values for the plot axis; xMax records the largest x value in the data
xMin = 0
yMin = 0
xMax = np.max(sampleData["sampleX"])

# Create the scatter plot of the data
plt.errorbar(x = sampleData["sampleX"], y = sampleData["sampleY"], yerr = sampleData["sampleY_error"], fmt = 'bo', capsize = 10, label = "Sample Data")
plt.axis(xmin = xMin, ymin = yMin)

# Code to create best fit line is here
def linearFunc(x,intercept,slope):
  """Straight-line model, y = intercept + slope*x, that curve_fit adjusts to match the data."""
  y = intercept + slope*x
  return y

# sigma gives curve_fit the y uncertainty of each point; absolute_sigma = True tells it those are
# real uncertainties (not just relative weights), so the covariance matrix reflects their actual size
a_fit, cov = curve_fit(linearFunc,sampleData['sampleX'],sampleData['sampleY'], sigma=sampleData["sampleY_error"], absolute_sigma = True)
# Results come back in the same order as the arguments of linearFunc after x: intercept first, then slope
slope = a_fit[1]
intercept = a_fit[0]
# The diagonal of the covariance matrix holds the variance of each fitted value; its square root is the uncertainty
d_intercept = np.sqrt(cov[0][0])
d_slope = np.sqrt(cov[1][1])

# Create the best fit line from the fit calculation and plot it
yfit = intercept + slope*sampleData['sampleX']
plt.plot(sampleData['sampleX'], yfit,color = "r", label="Fit Line")

# Plot Titles and Legend
plt.title("Sample Plot")
plt.xlabel("Sample X")
plt.ylabel("Sample Y")
plt.legend()

# Show the Graph, Print the Best Fit Line Equation
plt.show()
print("y=(%.6f +/- %.6f)x+(%.6f +/- %.6f)"%(slope,d_slope, intercept, d_intercept)) #Code to print the resulting best fit equation.  Should show below the graph


In [3]:
# Create a Histogram
# Fill in every blank (______) before running: the two range limits, the number of bins, and the title text

plt.hist(data['ColumnName'],range=[______,________], bins=_________, log=False)  # makes the histogram; log=False keeps a regular (non-logarithmic) count axis
plt.title("Time Between _____ to _______ at __________")
plt.xlabel("Time (in ns)")
plt.ylabel("number of events")
plt.grid(False);  # turns the grid lines off; the ending ";" hides the extra text output of the last line

NameError: name 'data' is not defined

In [None]:
# Creating a bar chart with multiple, grouped bars side by side

# Split the data into one data frame per person
personA = data[data.Person == "A"]
personB = data[data.Person == "B"]

# One group of bars per sense; the groups are centered at x = 0, 1, 2, ...
# Assumes personA and personB list the same senses in the same order -- TODO confirm for your data
labels = personA['Sense'].tolist()
x = np.arange(len(labels))

# The groups are always 1 unit apart, so the bar width must not grow with the number of groups.
# Two bars of width 0.4 fill 0.8 of each unit, which leaves a gap between neighboring groups.
width = 0.4

# Shift each person's bars half a bar width left or right so the pair sits side by side around the tick
plt.bar(x -(width*0.5), personA['StudentReportedTime'], width = width, label = "Student A")
plt.bar(x + (width*0.5), personB['StudentReportedTime'], width = width, label  = "Student B")

plt.xticks(x, labels)  # Put the sense names at the center of each group
plt.xlabel('Senses')
plt.ylabel("Reaction Time (s)")
plt.legend()

plt.show()

**Exporting Data**

In [None]:
# Export a dataframe to a csv file within the "Files" section (Folder Icon on the left)
# Replace ***DesiredFileNameHere*** with the file name you want
# By default the row index is written as the first column; add index=False inside the parentheses to leave it out
data.to_csv("***DesiredFileNameHere***.csv")

In [None]:
# Write line to a plain text file
# The 'w' mode creates the file, or erases its contents if the file already exists

with open('/content/Example2.txt', 'w') as writefile:
    writefile.write("This is line A\n")  # End with "\n" so that any text appended later starts on its own line

In [None]:
# Add a line to the end of a text file.  Mode 'a' (append) keeps what is already in the file,
# whereas mode 'w' would replace it

with open('/content/Example2.txt', mode='a') as appendfile:
    appendfile.write("This is line C\n")

In [None]:
# Read the whole text file and print its contents

with open('/content/Example2.txt', mode='r') as readfile:
    fileContents = readfile.read()
print(fileContents)

# Old, Legacy Code Blocks

In [None]:
# Create a scatter plot with a best fit line

# Lower limits for the plot axis, and the largest x value (used as the end of the fit line)
xMin = 0
yMin = 0
xMax = np.max(sampleData["sampleX"])

plt.scatter(x = sampleData["sampleX"], y = sampleData["sampleY"])
plt.axis(xmin = xMin, ymin = yMin)
plt.title("Sample Plot")
plt.xlabel("Sample X")
plt.ylabel("Sample Y")

# Code to create best fit line is here
# np.polyfit with degree 1 fits a straight line and returns the slope first, then the intercept
slope, intercept = np.polyfit(sampleData["sampleX"], sampleData["sampleY"], 1)
# np.linspace always gives exactly 200 values and includes xMax itself, so the line reaches the last data point
xValues = np.linspace(xMin, xMax, 200) # Creates a set of 200, evenly spaced "x" values
plt.plot(xValues, slope*xValues + intercept, color = "r") # Plots the x values and calculates the "y" values based on the best fit line result
plt.show()

print("y=%.6fx+%.6f"%(slope, intercept)) #Code to print the resulting best fit equation.  Should show below the graph