## Stage 3: Python Fundamentals

### NUMPY: NUMERICAL LOADING

In [8]:
# NumPy - Python Library for numerical computing
import numpy as np

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [9]:
# np.arange vs. np.array
# np.arange([start, ]stop, [step, ]dtype=None, *, like=None)
#   Builds an evenly spaced sequence. With just `stop` it counts from 0 in steps
#   of 1 and ends before `stop`, so no element has to be typed out by hand.
# np.array(array_like)
#   Converts an existing Python list (or other array-like object) into a NumPy
#   array; every element must already be present in the input.

# One million consecutive integers, 0 through 999999.
my_array = np.arange(1_000_000)
print(my_array)

[     0      1      2 ... 999997 999998 999999]


In [10]:
# np.arange call patterns. `a` is rebound by every example, and the trailing
# comment on each print shows the text it produces.

# stop only -> starts at 0, step 1
a = np.arange(5)
print(a)  # [0 1 2 3 4]

# start, stop -> the stop value itself is excluded
a = np.arange(2, 7)
print(a)  # [2 3 4 5 6]

# start, stop, step
a = np.arange(0, 10, 2)
print(a)  # [0 2 4 6 8]

# float step: NumPy's documentation suggests np.linspace for non-integer steps,
# because rounding can make the number of elements hard to predict
a = np.arange(1.0, 2.0, 0.2)
print(a)  # [1.  1.2 1.4 1.6 1.8]

# explicit dtype: an integer-valued range stored as floats
a = np.arange(5, dtype=float)
print(a)  # [0. 1. 2. 3. 4.]

[0 1 2 3 4]
[2 3 4 5 6]
[0 2 4 6 8]
[1.  1.2 1.4 1.6 1.8]
[0. 1. 2. 3. 4.]


In [11]:
# RESHAPE: 1-D -> 2-D
# Lay the 12 values of np.arange(12) out as 3 rows by 4 columns.
arr_2d = np.arange(12).reshape((3, 4))
print("ARRAY: ", arr_2d, sep="\n")
# .shape reports the size along each axis as a tuple: (rows, cols)
print("DIMENSIONS: ", arr_2d.shape)


ARRAY: 
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
DIMENSIONS:  (3, 4)


In [12]:
# VECTORIZATION
# An arithmetic operator applied to a NumPy array acts on every element at once,
# so doubling all values needs no explicit Python loop.
vectorized_result = my_array * 2
print(vectorized_result)

# Bind the doubled 2-D array to a new name instead of using `arr_2d *= 2`:
# the in-place form changed `arr_2d` itself, so re-running this cell kept
# doubling it. With a new name the cell gives the same result on every run.
arr_2d_doubled = arr_2d * 2
print(arr_2d_doubled)

[      0       2       4 ... 1999994 1999996 1999998]
[[ 0  2  4  6]
 [ 8 10 12 14]
 [16 18 20 22]]


In [13]:
# VECTORIZATION VS LOOPING
# The explicit loop below produces the same doubled values as `my_array * 2`,
# but it takes more code and handles one element per Python-level iteration.

# The two %%timeit cells that follow compare the approaches; the vectorized
# expression is expected to be orders of magnitude faster.

loop_result = []
for value in my_array:
    loop_result.append(value * 2)

In [14]:
%%timeit
# Time the vectorized doubling of the whole array in a single expression.
my_array * 2

507 μs ± 24.5 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [15]:
%%timeit
# Time the element-by-element Python version (a list comprehension) for comparison.
loop_result = [x * 2 for x in my_array]

36.6 ms ± 253 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
### DATASET LOADING & MANIPULATION/DATA FRAMES: PANDAS 

In [17]:
import pandas as pd

In [18]:
# Read the starter dataset into a DataFrame. The path is relative, so it is
# resolved against the kernel's current working directory.
df = pd.read_csv('../data/starter_data.csv')

In [19]:
# End the cell with the bare expression so Jupyter renders the frame with its
# rich table display instead of the plain-text form produced by print().
# The whole frame is shown because it only has 10 rows.
df

   id category  value  profit   market_cap
0   1     Tech   23.5     1.2  12000000000
1   2  Finance   15.8    -0.5   8500000000
2   3     Tech   45.2     3.1  15500000000
3   4   Energy   10.1     0.2   4200000000
4   5  Finance   12.7    -0.1   7800000000
5   6     Tech   30.4     1.8  14300000000
6   7   Energy   18.9     0.7   5100000000
7   8     Tech   55.6     4.5  18800000000
8   9  Finance   20.0     0.4   9200000000
9  10   Energy    8.5    -0.3   3700000000


In [20]:
# Print a summary of the frame: index range, per-column non-null counts and
# dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          10 non-null     int64  
 1   category    10 non-null     object 
 2   value       10 non-null     float64
 3   profit      10 non-null     float64
 4   market_cap  10 non-null     int64  
dtypes: float64(2), int64(2), object(1)
memory usage: 532.0+ bytes


In [7]:
# Preview the first rows of the frame; head() returns 5 rows when called
# without arguments.
# NOTE(review): the saved execution count of this cell (7) is lower than that
# of the cells above it, so it was not last run in top-to-bottom order. Re-run
# the notebook with Restart Kernel -> Run All to confirm it works on a fresh kernel.
df.head()

Unnamed: 0,id,category,value,profit,market_cap
0,1,Tech,23.5,1.2,12000000000
1,2,Finance,15.8,-0.5,8500000000
2,3,Tech,45.2,3.1,15500000000
3,4,Energy,10.1,0.2,4200000000
4,5,Finance,12.7,-0.1,7800000000


### EXPLORATORY DATA ANALYSIS USING PANDAS AND NUMPY

In [None]:
# .describe() gives you a quick look at the central tendency, dispersion, and shape of your data's distribution,
# helping you spot potential issues like outliers or missing values
summary_stats = df.describe()