# Messy Data Sample Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
# Load the per-muon table from DoubleMuon.csv; the path is relative to the
# directory the notebook is run from.
data = pd.read_csv('./DoubleMuon.csv')
# Preview the first 10 rows to check that the columns were parsed as expected.
data.head(10)

In [None]:
# `.shape` is an attribute of the DataFrame (not a method call): it is the
# tuple (number of rows, number of columns) of the table loaded above.
data.shape

In [None]:
# The CSV stores the two muons of an event on separate rows, told apart by
# 'subentry'.  Pair the subentry-0 row with the subentry-1 row of the same
# 'entry' so each event becomes one row holding both muons.  pandas suffixes
# the overlapping columns: *_x comes from subentry 0, *_y from subentry 1.
first_muon = data.loc[data['subentry'] == 0]
second_muon = data.loc[data['subentry'] == 1]

# Five-row previews of each half (not used again in the cells shown here).
zeros = first_muon.head().copy()
ones = second_muon.head().copy()

# Left join: every subentry-0 row is kept; an event with no subentry-1 row
# ends up with NaN in all of its *_y columns.
new_data = pd.merge(first_muon.copy(), second_muon.copy(), on='entry', how='left')

# Tidy up: discard the join key, the second copy of the event identifiers,
# and the per-muon columns that the analysis below does not use.
per_muon_unused = ['nMuon', 'subentry', 'luminosityBlock',
                   'pfRelIso03_all', 'pfRelIso04_all', 'tightId', 'softId',
                   'dxy', 'dxyErr', 'dz', 'dzErr']
unwanted = ['entry', 'run_y', 'event_y']
unwanted += [stem + '_x' for stem in per_muon_unused]
unwanted += [stem + '_y' for stem in per_muon_unused]
new_data = new_data.drop(columns=unwanted)

# Work on a copy under a shorter name, then add the derived quantities.
df = new_data.copy()

# Two-muon invariant mass from (pt, eta, phi) with the muon masses neglected:
#   M^2 = 2 * pt1 * pt2 * (cosh(eta1 - eta2) - cos(phi1 - phi2))
delta_eta = df['eta_x'] - df['eta_y']
delta_phi = df['phi_x'] - df['phi_y']
df['mass'] = np.sqrt(2*df['pt_x']*df['pt_y']*(np.cosh(delta_eta) - np.cos(delta_phi)))

# Net charge of the pair; zero means the two muons have opposite sign.
df['net_q'] = df.charge_x + df.charge_y

df.head()

In [None]:
# Compare the dimuon invariant-mass spectrum under successively tighter cuts:
#   1. all events,
#   2. opposite-sign pairs (net charge == 0),
#   3. opposite-sign pairs with both muons inside |eta| < eta_max.
# The same eta_max is applied to both muons and is also used to build the
# legend label, so the plotted selection and its description cannot drift
# apart (previously eta_x was cut at 1 while the label advertised 1.5).
eta_max = 1.5
opposite_sign = df['net_q'] == 0
both_central = (np.abs(df['eta_x']) < eta_max) & (np.abs(df['eta_y']) < eta_max)

plotdata = df.loc[opposite_sign]
plotdata2 = df.loc[opposite_sign & both_central]

# Identical binning for all three histograms so they are directly comparable.
nbins = 80
xrange = (8,16)
plt.hist(df['mass'], histtype='step', bins=nbins, range=xrange, label="all events")
plt.hist(plotdata['mass'], histtype='step', bins=nbins, range=xrange, label="zero net Q")
plt.hist(plotdata2['mass'], histtype='step', bins=nbins, range=xrange,
         label=f"zero net Q & |eta| < {eta_max}")

plt.legend(framealpha=1, fancybox=True, loc='upper right')
plt.xlabel('M [GeV]')
# Log scale keeps small peaks visible on top of the much larger continuum.
plt.yscale('log')
plt.show()