In [None]:
import numpy as np
import numpy.linalg as la ## used for linear algebra things like inverse matrix, etc.

In [None]:
# 2x2 integer array; np.intc is the C `int` type that the "i" type code stands for.
a = np.array([[1, 2], [3, 4]], dtype=np.intc)

In [None]:
# Show the array in its plain-text form (print() omits the array(...) wrapper).
print(a)

All Markdown blocks correspond to the code above them; only the headings are exceptions.

# Dtype and Dimensions

In [None]:
a.dtype  # data type of the elements stored in the array

In [None]:
a.ndim  # number of dimensions (axes) of the array

#### *If you want to create a multi-dimensional array (like a 2D array), all the inner arrays must have the same number of elements.*

#### *For inner lists of unequal length, pass `dtype=object`; the result is a 1-D array whose elements are Python lists, not a true multi-dimensional array.*


In [None]:
# Example that does not produce a 2D array: the inner lists have different lengths (2 vs 3 elements).
# It is left commented out so the notebook still runs from top to bottom.
# b = np.array([[1,2], [1, 2, 3]])
# b.ndim

In [None]:
# Build a 2x2x2 array out of two 2x2 blocks.
first_block = [[1, 2], [54, 56]]
second_block = [[50, 40], [30, 20]]
c = np.array([first_block, second_block])

# One index per axis, separated by commas: block 0, row 1, column 1 -> 56.
print(c[0, 1, 1])

# Shape and Size

In [None]:
# shape is a tuple with one length per axis: (2D blocks, rows per block, elements per row)
dims = c.shape
print(dims)
# The first entry of the tuple is the number of 2D blocks.
print(dims[0])

In [None]:
c.size  # total number of elements in the whole array, across all dimensions

# Arange, Random, and Reshape

In [None]:
# arange(start, stop) yields start, start+1, ..., stop-1.
# With a single argument n it yields 0, 1, ..., n-1.
a = np.arange(start=20, stop=100)
a

In [None]:
# A third argument sets the step: 20, 23, 26, ... up to (but not including) 100.
a = np.arange(start=20, stop=100, step=3)
a

In [None]:
# permutation() returns a new array holding the elements of `a` in random order.
b = np.random.permutation(a)
b

#### *Random numbers can be generated with `np.random.rand()` (uniform on [0, 1)) or `np.random.randn()` (standard normal distribution).*

#### *Can use np.zeros and np.ones to create arrays with zeroes or ones* 

In [None]:
# Lay the numbers 0..99 out as a 2-dimensional array: 4 rows of 25 elements each.
c = np.reshape(np.arange(100), (4, 25))
c

# Slicing and Sorting

#### *When you slice a NumPy array, the slice doesn’t make a copy of the data. Instead, it is a view that points to the same memory location as the original array. This means that if you modify the slice, you’re also changing the original data. Call `.copy()` on the slice when you need an independent array.*

In [None]:
D = np.arange(100)

## elements at positions 3 through 9 (the stop index 10 is excluded)
print(D[3:10])

## every fifth element, starting at the first one
print(D[::5])

## every fifth element walking backwards, starting at the last one
print(D[::-5])

print()

## 5x4 matrix of random whole numbers between 0 and 10, stored as floats
E = np.round(np.random.rand(5, 4) * 10)

## all rows, columns 1 and 2
print(E[:, 1:3])

## rows 1 and 2, columns 1 and 2
print(E[1:3, 1:3])

#### *[::5] means “take every 5th element from start to end, without specifying a start or end index."*

#### *":" means everything*

In [None]:
## the matrix as it is
print(E)

print()

## its transpose: rows and columns swapped
print(np.transpose(E))

In [None]:
## In-place sort along axis 0: the values inside each column are ordered from top to bottom.
## With axis=1 the values inside each row would be sorted instead.
## The original array E is modified; the same call works for arrays with more dimensions.
E.sort(axis=0)
E

#### *.sort() can be used on 1-dimensional arrays, just don't give it an axis.*

# Masking and Broadcasting

#### *Indexing with a list of indexes or a True/False mask makes a copy of the data, so changing the result doesn't change the original array.*

In [None]:
F = np.arange(20, 25)

## index with a list of positions (note the double brackets) to pick specific elements
wanted_positions = [0, 3, 4]
print(F[wanted_positions])

## index with a True/False list: only the positions marked True are returned
keep_flags = [True, False, True, False, False]
print(F[keep_flags])

## index with a condition: F < 23 builds the True/False mask for us
is_below_23 = F < 23
print(F[is_below_23])

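#### *Combine conditions with `&` and wrap each one in parentheses, e.g. `F[(F > 20) & (F < 24)]`. `&` works element by element on arrays; Python's `and` only works on single True/False values and raises an error when used on arrays.*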

In [None]:
G = np.reshape(np.arange(40, 60), (2, 10))
## broadcasting: the scalar 5 is added to every element, whatever the number of dimensions
## (arrays of compatible shape can be added the same way)
print(G + 5)

Z = np.reshape(np.arange(2, 8), (2, 3))
W = np.reshape(np.arange(14, 20), (2, 3))

print()

## side by side: the result has 2 rows and 6 columns
print(np.hstack([Z, W]))

print()

## one on top of the other: the result has 4 rows and 3 columns
print(np.vstack([Z, W]))

#### *Can use .concatenate() for joining on existing axis*

In [None]:
import pandas as pd

# Pandas Series and DataFrames

In [None]:
## A Series is a 1-dimensional array whose entries carry labels; here the labels are custom.
labels = ["a", "b", "c", "d", "e"]
A = pd.Series(data=[1, 2, 3, 4, 5], index=labels)
print(A)

In [None]:
## .values holds the underlying data, .index holds the labels attached to it
print(A.values)
print(A.index)

#### *When slicing by explicit labels (e.g. `A["a":"c"]`), the final element is included too. Slicing by integer position (e.g. `A[0:2]`) excludes the end, the same as NumPy.*

#### *Can create a series with a dictionary, just pass it to pd.Series()*

In [None]:
row_labels = ["a", "b", "c", "d", "e"]
A = pd.Series([1, 2, 3, 4, 5], index=row_labels)
B = pd.Series([10, 15, 25, 35, 600], index=row_labels)

## A DataFrame is a 2-dimensional table: each dict key names a column, each value fills it.
columns = {"A": A, "B": B}
C = pd.DataFrame(columns)
print(C)

print()

## transposed view: the former columns become rows
print(C.T)

In [None]:
## .values exposes the table as a plain NumPy array, so NumPy indexing applies to it
table = C.values
print(table)
print(C.index)

## row 2, column 0 of the underlying array
print(table[2, 0])

print(C.columns)

## assigning to a new key adds a column, just like adding a key to a dictionary
C["D"] = C["B"] / 90
print(C["D"])

## and `del` removes the column again, also like a dictionary
del C["D"]

# Missing values and using .loc, .iloc

In [None]:
## Each dict becomes one row; a key missing from a row ('c' in the first, 'a' in the second) shows up as NaN.
records = [{'a': 1, "b": 4}, {'b': -3, "c": 9}]
D = pd.DataFrame(records)
D

In [None]:
## fillna returns a new frame in which every NaN is replaced by the given fixed value
D_filled = D.fillna(2)
print(D_filled)

print()

## dropna(axis=1) removes every column that contains a NaN (axis=0 would remove rows instead).
## It is applied to the original D on purpose: once fillna has run there is nothing left to drop.
D_dropped = D.dropna(axis=1)
D_dropped

In [None]:
## The labels are deliberately out of order so that label and position differ.
data = pd.Series(["a", "b", "c"], index=[3, 0, 5])

print(data.loc[0])   ## label 0 -> "b"
print(data.iloc[0])  ## position 0 -> "a"

print()

print(data.loc[5])  ## label 5 exists -> "c"
try:
    print(data.iloc[5])  ## position 5 does not exist in a 3-element Series
except IndexError as err:
    ## Catch only the expected out-of-bounds error so unrelated failures are not hidden.
    print("Error!", err)

print()

print(data.loc[0:5])   ## label slice: from label 0 through label 5, end label included
print(data.iloc[0:5])  ## position slice: positions 0..4, clipped to the elements that exist

#### *Use .loc for explicit indexing, and .iloc for implicit indexing (explicit --> created by you)*

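#### *With plain `[]`, pandas mixes both styles: a single key such as `data[0]` is looked up in the explicit index, while a slice such as `data[0:2]` uses implicit positions. Prefer `.loc` / `.iloc` to avoid the ambiguity.*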

# Matplotlib, Seaborn, and Bokeh

#### *Can create interactive plots with Bokeh*

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from bokeh.plotting import figure, output_file, show

In [None]:
## 1000 equally spaced points from 0 to 10 (both ends included)
x = np.linspace(0, 10, num=1000)

## sine in the default colour, cosine in red, drawn on the same axes
plt.plot(x, np.sin(x))
plt.plot(x, np.cos(x), color="red")

In [None]:
## draw every 20th point of the sine curve as an individual black marker
sampled_x = x[::20]
plt.scatter(sampled_x, np.sin(sampled_x), c="black")

#### *Can customize the plot a lot with different inputs.*

In [None]:
plt.style.use('classic')

x = np.linspace(0, 100, 1000)
## five random walks: running totals taken down each of the 5 columns
y = np.random.rand(1000, 5).cumsum(axis=0)

## plotting a 2D y draws one line per column
plt.plot(x, y)
## the string is split into one label per character, one for each of the five lines
plt.legend("ABCDE", ncol=2, loc="upper left")

#### *np.cumsum() calculates the cumulative sum of array elements, meaning each position contains the sum of all elements from the beginning up to that position.*

In [None]:
sns.set_theme()  ## switch to seaborn's default theme for a more polished look
plt.plot(x, y)
## y has five columns, so five lines are drawn and five labels are needed;
## with only "ABCD" the last line would be missing from the legend.
plt.legend("ABCDE", ncol=2, loc="upper left")

In [None]:
sns.set_theme()

## three normally distributed samples with different spreads; Z is additionally shifted to a mean of 500
n_points = 5000
X = pd.Series(np.random.randn(n_points) * 50)
Y = pd.Series(np.random.randn(n_points) * 200)
Z = pd.Series(500 + np.random.randn(n_points) * 100)

data = pd.DataFrame(dict(X=X, Y=Y, Z=Z))
print(data.head())

## one filled KDE (smoothed density) curve per column, all on the same axes
## alternative: plt.hist(column, density=True, alpha=0.5) draws a histogram instead
for _name, column in data.items():
    sns.kdeplot(column, fill=True)

In [None]:
## Centre panel: joint distribution of X and Y. A stretched, line-like pattern
## indicates correlation; a round pattern indicates no correlation.
## Top and right panels: the individual distribution of each variable.
sns.jointplot(x="X", y="Y", data=data, kind="hist")

In [None]:
## Diagonal panels: the distribution of each individual variable.
## Off-diagonal panels: the relationship between each pair of variables.
sns.pairplot(data, kind="hist")

# Learn Regression, Dummy Data, and Data Split

In [None]:
import sklearn

In [None]:
## Linear Regression
## Import the estimator explicitly: a bare `import sklearn` is not guaranteed to load
## the linear_model submodule, so sklearn.linear_model can fail on a fresh kernel.
from sklearn.linear_model import LinearRegression

x = np.arange(5, 30)
## noisy line y = 4x - 7; the noise vector is sized from x so the two can never get out of step
y = 4 * x - 7 + 5 * np.random.randn(x.size)
plt.scatter(x, y)

## fit_intercept -- whether the model should also estimate the bias (intercept) term
regressor = LinearRegression(fit_intercept=True)

## scikit-learn expects a 2d feature matrix, so turn x into a single-column array
x_column = x[:, np.newaxis]
regressor.fit(x_column, y)

## predictions of the fitted line at the training points
yfix = regressor.predict(x_column)

plt.plot(x, yfix)


In [None]:
## Import from the public package path; modules with a leading underscore are private
## and may move between scikit-learn releases.
from sklearn.datasets import make_blobs ## generates dummy data for testing

## n_samples -> total number of points, divided equally among the clusters
## centers -> how many clusters; cluster_std -> the standard deviation within a cluster
## X -> 2d array of coordinates (one row per point), y -> 1d array of cluster labels
X, y = make_blobs(n_samples=500, centers=3, cluster_std=1.2)

## first coordinate on the horizontal axis, second on the vertical axis, coloured by cluster label
plt.scatter(X[:, 0], X[:, 1], c=y, cmap="autumn")

In [None]:
from sklearn.model_selection import train_test_split

## hold out 33% of the samples for testing; the rest is used for training
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [None]:
from sklearn.svm import SVC

In [None]:
## fit() returns the fitted estimator, so construction and training can be chained
mdl = SVC(gamma="auto").fit(x_train, y_train)

y_predict = mdl.predict(x_test)

## accuracy in percent: correctly predicted labels divided by the number of test samples
n_correct = (y_predict == y_test).sum()
acc = 100 * n_correct / y_test.size

print(acc)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
## a forest of 20 decision trees; fit() returns the fitted estimator
randomForest = RandomForestClassifier(n_estimators=20).fit(x_train, y_train)

y_predict = randomForest.predict(x_test)

## accuracy in percent: correctly predicted labels divided by the number of test samples
n_correct = (y_predict == y_test).sum()
acc = 100 * n_correct / y_test.size

print(acc)