# Preprocessing for Machine Learning

## Data Preprocessing: Initial steps (exploring data types & missing data)

### Exploring missing data:

In [67]:
import pandas as pd

volunteer = pd.read_csv("volunteer.csv")
volunteer.isna().sum()

opportunity_id          0
content_id              0
vol_requests            0
event_time              0
title                   0
hits                    0
summary                 0
is_priority           603
category_id            48
category_desc          48
amsl                  665
amsl_unit             665
org_title               0
org_content_id          0
addresses_count         0
locality               70
region                  0
postalcode              6
primary_loc           665
display_url             0
recurrence_type         0
hours                   0
created_date            0
last_modified_date      0
start_date_date         0
end_date_date           0
status                  0
Latitude              665
Longitude             665
Community Board       665
Community Council     665
Census Tract          665
BIN                   665
BBL                   665
NTA                   665
dtype: int64

### Dropping missing dataset:

In [70]:
# Drop the Latitude and Longitude columns from volunteer
volunteer_cols = volunteer.drop(["Latitude", "Longitude"], axis=1)

# Drop rows with missing category_desc values from volunteer_cols
volunteer_subset = volunteer_cols.dropna(subset=["category_desc"])

# Print out the shape of the subset
print(volunteer_subset.shape)

(617, 33)


### Exploring data types:

In [73]:
volunteer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 665 entries, 0 to 664
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   opportunity_id      665 non-null    int64  
 1   content_id          665 non-null    int64  
 2   vol_requests        665 non-null    int64  
 3   event_time          665 non-null    int64  
 4   title               665 non-null    object 
 5   hits                665 non-null    int64  
 6   summary             665 non-null    object 
 7   is_priority         62 non-null     object 
 8   category_id         617 non-null    float64
 9   category_desc       617 non-null    object 
 10  amsl                0 non-null      float64
 11  amsl_unit           0 non-null      float64
 12  org_title           665 non-null    object 
 13  org_content_id      665 non-null    int64  
 14  addresses_count     665 non-null    int64  
 15  locality            595 non-null    object 
 16  region  

### Converting a column type:

In [77]:
# Print the head of the hits column
volunteer["hits"].head()

# Convert the hits column to type int
volunteer["hits"] = volunteer["hits"].astype(int)

# Look at the dtypes of the dataset
volunteer.dtypes

opportunity_id          int64
content_id              int64
vol_requests            int64
event_time              int64
title                  object
hits                    int32
summary                object
is_priority            object
category_id           float64
category_desc          object
amsl                  float64
amsl_unit             float64
org_title              object
org_content_id          int64
addresses_count         int64
locality               object
region                 object
postalcode            float64
primary_loc           float64
display_url            object
recurrence_type        object
hours                   int64
created_date           object
last_modified_date     object
start_date_date        object
end_date_date          object
status                 object
Latitude              float64
Longitude             float64
Community Board       float64
Community Council     float64
Census Tract          float64
BIN                   float64
BBL       

### Training and Test Sets:
- trying to predict the 'category_desc' variable using the other features in the dataset.
- what is the class distribution (and imbalance) for this label? which descriptions occur less than 50 times?

In [81]:
from sklearn.model_selection import train_test_split
volunteer["category_desc"].value_counts() <= 50

category_desc
Strengthening Communities    False
Helping Neighbors in Need    False
Education                    False
Health                       False
Environment                   True
Emergency Preparedness        True
Name: count, dtype: bool

### Stratified sampling:- distribution of class labels in the 'category_desc' column is uneven. To effectively train a model to predict 'category_desc', ensure that the model is trained on a sample of data that is representative of the entire dataset.!

In [None]:
# Create a DataFrame with all columns except category_desc
X = volunteer.drop("category_desc", axis=1)

# Create a category_desc labels dataset
y = volunteer[["category_desc"]]

# Use stratified sampling to split up the dataset according to the y dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Print the category_desc counts from y_train
print(y_train["category_desc"].value_counts())

## Standardization: Scaling & Normalization
- Transform continuous data to appear normally distributed
- 'wine' dataset

### Modeling without normalizing:
- for testing & comparing K-Nearest Neighbors model's accuracy

In [None]:
wine = pd.read_csv("wine_types.csv")

# Split the dataset and labels into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Create the k-nearest neighbors model
knn = KNeighborsClassifier(n_neighbors=3)  # You can choose the value of K

# Fit the k-nearest neighbors model to the training data
knn.fit(X_train, y_train)

# Score the model on the test data - accuracy
print(knn.score(X_test, y_test)) # result: 0.6666666666666666

### Log Normalization:

In [None]:
# Checking the variance - 'Proline' columns has the largest variance - candidate for log normalization
wine.var()

# Print out the variance of the Proline column
print(wine["Proline"].var())

# Apply the log normalization function to the Proline column
wine["Proline_log"] = np.log(wine["Proline"])

# Check the variance of the normalized Proline column
print(wine["Proline_log"].var())

# Print all
print(wine.var())

### Scaling Data:
- we want 'Ash', 'Alcalinity of ash', and 'Magnesium' columns in the wine dataset to train a linear model, but it's possible that these columns are all measured in different ways, which would bias a linear model.

In [None]:
wine.describe() # The max of Ash is 3.23, the max of Alcalinity of ash is 30, and the max of Magnesium is 162.

# Since we know that the Ash, Alcalinity of ash, and Magnesium columns in the wine dataset are all on different scales
# let's standardize them in a way that allows for use in a linear model.

In [None]:
from sklearn.preprocessing import StandardScaler

# Create the scaler
scaler = StandardScaler()

# Subset the DataFrame you want to scale 
wine_subset = wine[['Ash', 'Alcalinity of ash', 'Magnesium']]

# Apply the scaler to wine_subset
wine_subset_scaled = scaler.fit_transform(wine_subset)

### Standardized Data and Modeling: K-Nearest Neighbors

#### KNN on non-scaled data:
- Before adding standardization, let's look at the accuracy of a K-nearest neighbors model on the wine dataset without standardizing the data.


In [None]:
# Split the dataset and labels into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Create the k-nearest neighbors model
knn = KNeighborsClassifier(n_neighbors=3) 

# Fit the k-nearest neighbors model to the training data
knn.fit(X_train, y_train)

# Score the model on the test data - accuracy
print(knn.score(X_test, y_test)) # 0.7111111111111111

#### KNN on scaled data:
- The accuracy score on the unscaled wine dataset was decent, but let's use standardization.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Instantiate a StandardScaler
scaler = StandardScaler()

# Scale the training and test features
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create the k-nearest neighbors model
knn = KNeighborsClassifier(n_neighbors=3) 

# Fit the k-nearest neighbors model to the training data
knn.fit(X_train_scaled, y_train)

# Score the model on the test data
print(knn.score(X_test_scaled, y_test)) # 0.9555555555555556 - Improved from 71.11% to 95.56%

## Feature Engineering:
- Extract and expand information from existing features.

### Encoding categorical variables
- 'hiking.json' dataset
- There are several columns here that need encoding before they can be modeled, one of which is the 'Accessible' column. Accessible is a binary feature, so it has two values, Y or N.

In [None]:
hiking = pd.read_json("hiking.json")

# Set up the LabelEncoder object
enc = LabelEncoder()

# Apply the encoding to the "Accessible" column
hiking["Accessible_enc"] = enc.fit_transform(hiking["Accessible"])

# Compare the two columns
print(hiking[["Accessible", "Accessible_enc"]].head())

### Encoding categorical variables - one-hot
- One of the columns in the volunteer dataset, 'category_desc', gives category descriptions for the volunteer opportunities listed. Because it is a categorical variable with more than two categories, we need to use one-hot encoding to transform this column numerically.

In [None]:
# Transform the category_desc column
category_enc = pd.get_dummies(volunteer["category_desc"])

# Take a look at the encoded columns
print(category_enc.head())

### Aggregating numerical features

In [None]:
# Use .loc to create a mean column
running_times_5k["mean"] = running_times_5k.loc[:, "run1": "run5"].mean(axis=1)

# Take a look at the results
print(running_times_5k.head())

### Extracting datetime components


In [None]:
# First, convert string column to date column
volunteer["start_date_converted"] = pd.to_datetime(volunteer["start_date_date"])

# Extract just the month from the converted column
volunteer["start_date_month"] = volunteer["start_date_converted"].dt.month

# Take a look at the converted and new month columns
print(volunteer[["start_date_converted", "start_date_month"]].head())

### Extracting string patterns


In [None]:
# Write a pattern to extract numbers and decimals
def return_mileage(length):
    
    # Search the text for matches
    mile = re.search(r'\d+\.\d+', length) #'\d+' search for digits, as many as you can. '\.' search for period, and so on.
    
    # If a value is returned, use group(0) to return the found value
    if mile is not None:
        return float(mile.group(0))

# Apply the function to the Length column and take a look at both columns
hiking["Length_num"] = hiking["Length"].apply(return_mileage)
print(hiking[["Length", "Length_num"]].head())

### Vectorizing text


In [None]:
# Take the title text
title_text = volunteer["title"]

# Create the vectorizer method
tfidf_vec = TfidfVectorizer()

# Transform the text into tf-idf vectors
text_tfidf = tfidf_vec.fit_transform(title_text)

print(text_tfidf)

### Text classification using tf/idf vectors


In [None]:
# Split the dataset according to the class distribution of category_desc
y = volunteer["category_desc"]
X_train, X_test, y_train, y_test = train_test_split(text_tfidf.toarray(), y, stratify=y, random_state=42)

# Fit the model to the training data
nb.fit(X_train, y_train)

# Print out the model's accuracy
print(nb.score(X_test, y_test))

## Feature Selection for Model Creation:
- Selecting features to be used for modeling
- Doesn't create new features
- Improve model's performance

### Selecting relevant features - Removing redundant features

In [None]:
# Create a list of redundant column names to drop
to_drop = ["locality", "region", "category_desc", "created_date", "vol_requests"]

# Drop those columns from the dataset
volunteer_subset = volunteer.drop(to_drop, axis=1)

# Print out the head of volunteer_subset
print(volunteer_subset.head())

### Checking for correlated features

In [None]:
# Print out the column correlations of the wine dataset
print(wine.corr())

# Drop that column from the DataFrame
wine = wine.drop("Flavanoids", axis=1)

print(wine.head())

### Exploring text vectors, part 1
- Let's expand on the text vector exploration method using the volunteer dataset's title tf/idf vectors. In this first part of text vector exploration, we're going to add to that function we learned about in the slides. We'll return a list of numbers with the function. In the next exercise, we'll write another function to collect the top words across all documents, extract them, and then use that list to filter down our text_tfidf vector.

In [None]:
def return_weights(vocab, original_vocab, vector, vector_index, top_n):
    zipped = dict(zip(vector[vector_index].indices, vector[vector_index].data))
    
    # Let's transform that zipped dict into a series
    zipped_series = pd.Series({vocab[i]:zipped[i] for i in vector[vector_index].indices})
    
    # Let's sort the series to pull out the top n weighted words
    zipped_index = zipped_series.sort_values(ascending=False)[:top_n].index
    return [original_vocab[i] for i in zipped_index]

# Print out the weighted words
print(return_weights(vocab, tfidf_vec.vocabulary_, text_tfidf, vector_index=8, top_n=3))

### Exploring text vectors, part 2
- Using the return_weights() function you wrote in the previous exercise, we're now going to extract the top words from each document in the text vector, return a list of the word indices, and use that list to filter the text vector down to those top words.

In [None]:
def words_to_filter(vocab, original_vocab, vector, top_n):
    filter_list = []
    for i in range(0, vector.shape[0]):
        # here we'll call the function from the previous exercise, 
        # and extend the list we're creating
        filtered = return_weights(vocab, original_vocab, vector, i, top_n)
        filter_list.extend(filtered)
    # Return the list in a set, so we don't get duplicate word indices
    return set(filter_list)

# Call the function to get the list of word indices
filtered_words = words_to_filter(vocab, tfidf_vec.vocabulary_, text_tfidf, top_n=3)

# By converting filtered_words back to a list, 
# we can use it to filter the columns in the text vector
filtered_text = text_tfidf[:, list(filtered_words)]

### Training Naive Bayes with feature selection:
- re-run the Naive Bayes text classification model

In [None]:
# Split the dataset according to the class distribution of category_desc
X_train, X_test, y_train, y_test = train_test_split(filtered_text.toarray(), y, stratify=y, random_state=42)

# Fit the model to the training data
nb.fit(X_train, y_train)

# Print out the model's accuracy
print(nb.score(X_test, y_test))

### Using PCA

In [None]:
# Instantiate a PCA object
pca = PCA()

# Define the features and labels from the wine dataset
X = wine.drop('Type', axis=1)
y = wine["Type"]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Apply PCA to the wine dataset X vector
pca_X_train = pca.fit_transform(X_train)
pca_X_test = pca.transform(X_test)

# Look at the percentage of variance explained by the different components
print(pca.explained_variance_ratio_)

### Training a model with PCA


In [None]:
# Fit knn to the training data
knn.fit(pca_X_train, y_train)

# Score knn on the test data and print it out
print(knn.score(pca_X_test, y_test))