# Data processing and cleaning

In [56]:
import pandas as pd
import numpy as np

In [57]:
#Load the data from the CSV file
df = pd.read_csv('householddata.csv')

In [58]:
#Convert the date columns to datetime objects
df['Start Time'] = pd.to_datetime(df['Start Time'])
df['End Time'] = pd.to_datetime(df['End Time'])

In [59]:
# Remove rows with missing values
df.dropna(inplace=True)

In [60]:
# Drop unnecessary columns
df.drop(['Title', 'Description', 'Location', 'Review'], axis=1, inplace=True)

In [61]:
# Convert the categorical variables to numerical variables
df['Status'] = pd.Categorical(df['Status'], categories=['requested', 'in-progress', 'completed', 'rejected'], ordered=True)
df['Status'] = df['Status'].cat.codes

In [62]:
# Convert the usernames to numerical IDs
df['Customer ID'] = df['Customer Username'].astype('category').cat.codes
df['Worker ID'] = df['Worker Username'].astype('category').cat.codes
df.drop(['Customer Username', 'Worker Username'], axis=1, inplace=True)

In [63]:
# Display the processed data
print(df.head())

            Start Time            End Time  Status  Rating  Hourly Rate  \
5  2023-10-30 20:00:30 2023-10-30 22:00:30       2     4.0    40.707462   
13 2023-08-06 14:21:21 2023-08-06 16:21:21       2     0.0    31.252497   
14 2023-12-07 19:14:42 2023-12-07 21:14:42       2     1.0    44.312673   
21 2023-01-16 23:52:14 2023-01-17 01:52:14       2     4.0    33.625232   
22 2023-08-26 17:29:17 2023-08-26 19:29:17       2     4.0    22.309144   

    Total Cost  Customer ID  Worker ID  
5    81.414924            5          2  
13   62.504993            5          4  
14   88.625345            5          0  
21   67.250465            5          3  
22   44.618289            5          1  


# Train-test split

In [64]:
#Shuffle the rows of the data
shuffled_data = df.sample(frac=1).reset_index(drop=True)

In [65]:
#Split the data into training and testing sets
num_rows = shuffled_data.shape[0]
train_size = int(0.8 * num_rows)
train_data = shuffled_data.iloc[:train_size]
test_data = shuffled_data.iloc[train_size:]

In [85]:
#Define a function to calculate the Pearson correlation coefficient
def pearson_correlation(x, y):
    x = np.array(x)
    y = np.array(y)
    n = len(x)
    xy = np.multiply(x, y)
    sum_x = sum(x)
    sum_y = sum(y)
    sum_xy = sum(xy)
    sum_x_squared = sum(x ** 2)
    sum_y_squared = sum(y ** 2)
    numerator = n * sum_xy - sum_x * sum_y
    denominator = np.sqrt((n * sum_x_squared - sum_x ** 2) * (n * sum_y_squared - sum_y ** 2))
    return numerator / denominator

In [86]:
#Define a function to predict the rating for a worker based on a customer's ratings and the Pearson correlation coefficients
def predict_rating(worker_ratings, correlation_coefficients, customer_ratings):
    numerator = 0
    denominator = 0
    for i in range(len(worker_ratings)):
        if worker_ratings[i] != 0 and customer_ratings[i] != 0:
            numerator += correlation_coefficients[i] * (customer_ratings[i] - np.mean(customer_ratings))
            denominator += np.abs(correlation_coefficients[i])
    if denominator == 0:
        return 0
    else:
        return np.mean(worker_ratings) + (numerator / denominator)

In [87]:
#Define a function to calculate the mean absolute error (MAE) between the predicted ratings and the actual ratings
def calculate_mae(predictions, actual_ratings):
    n = len(predictions)
    error = sum([abs(predictions[i] - actual_ratings[i]) for i in range(n)])
    return error / n

In [88]:
#Calculate the Pearson correlation coefficients between each pair of customers and workers
num_customers = len(df['Customer ID'].unique())
num_workers = len(df['Worker ID'].unique())
correlation_matrix = np.zeros((num_customers, num_workers))
for i in range(num_customers):
    customer_ratings = train_data[train_data['Customer ID'] == i]['Rating'].tolist()
    for j in range(num_workers):
        worker_ratings = train_data[train_data['Worker ID'] == j]['Rating'].tolist()
        common_ratings = (train_data[train_data['Customer ID'] == i]
                          .merge(train_data[train_data['Worker ID'] == j], on='Start Time', suffixes=('_cust', '_work')))
        x = common_ratings['Rating_cust'].tolist()
        y = common_ratings['Rating_work'].tolist()
        correlation_coefficient = pearson_correlation(x, y)
        correlation_coefficients[i][j] = correlation_coefficient

NameError: name 'correlation_coefficients' is not defined

In [30]:
# Split the data into training and testing sets
num_rows = shuffled_data.shape[0]
train_size = int(0.8 * num_rows)

In [31]:
train_data = shuffled_data.iloc[:train_size]
test_data = shuffled_data.iloc[train_size:]

In [35]:
#Create a dictionary to store the user ratings from the training data
user_ratings = {}
for i in range(train_data.shape[0]):
    customer_id = train_data.loc[i, 'Customer ID']
    worker_id = train_data.loc[i, 'Worker ID']
    rating = train_data.loc[i, 'Rating']
    if customer_id not in user_ratings:
        user_ratings[customer_id] = {}
    user_ratings[customer_id][worker_id] = rating

In [37]:
#Define a function to calculate the Pearson correlation coefficient between two users
from math import sqrt

def pearson_correlation(user1_ratings, user2_ratings):
    shared_items = set(user1_ratings.keys()) & set(user2_ratings.keys())
    num_items = len(shared_items)
    if num_items == 0:
        return 0
    sum1 = sum([user1_ratings[item] for item in shared_items])
    sum2 = sum([user2_ratings[item] for item in shared_items])
    sum1_sq = sum([pow(user1_ratings[item], 2) for item in shared_items])
    sum2_sq = sum([pow(user2_ratings[item], 2) for item in shared_items])
    product_sum = sum([user1_ratings[item] * user2_ratings[item] for item in shared_items])
    numerator = product_sum - (sum1 * sum2 / num_items)
    denominator = sqrt((sum1_sq - pow(sum1, 2) / num_items) * (sum2_sq - pow(sum2, 2) / num_items))
    if denominator == 0:
        return 0
    return numerator / denominator

In [39]:
#Define a function to predict the rating for a given user and item based on the training data
def predict_rating(user_ratings, user, item):
    similarities = []
    for other_user in user_ratings:
        if other_user != user and item in user_ratings[other_user]:
            similarity = pearson_correlation(user_ratings[user], user_ratings[other_user])
            if similarity > 0:
                similarities.append((other_user, similarity))
    similarities.sort(key=lambda x: x[1], reverse=True)
    if len(similarities) == 0:
        return None
    numerator = sum([user_ratings[sim_user][item] * sim for sim_user, sim in similarities])
    denominator = sum([sim for sim_user, sim in similarities])
    return numerator / denominator

In [41]:
#Loop through the testing data and use the predict_rating function to predict the ratings
predicted_ratings = []
test_data = test_data.reset_index(drop=True)
for i in range(test_data.shape[0]):
    customer_id = test_data.loc[i, 'Customer ID']
    worker_id = test_data.loc[i, 'Worker ID']
    actual_rating = test_data.loc[i, 'Rating']
    predicted_rating = predict_rating(user_ratings, customer_id, worker_id)
    if predicted_rating is not None:
        predicted_ratings.append((actual_rating, predicted_rating))

In [46]:
actual_ratings = list(test_data['Rating'])

#Calculate the accuracy of the predictions using the mean absolute error (MAE)
def calculate_mae(predictions, actual_ratings):
    n = len(predictions)
    error = sum([abs(predictions[i] - actual_ratings[i]) for i in range(n)])
    return error / n

mae = calculate_mae(predicted_ratings, actual_ratings)
print("MAE:", mae)

TypeError: unsupported operand type(s) for -: 'tuple' and 'float'

In [114]:
import pandas as pd
import numpy as np

# Load the data from the CSV file
df = pd.read_csv('householddata.csv')

# Convert the date columns to datetime objects
df['Start Time'] = pd.to_datetime(df['Start Time'])
df['End Time'] = pd.to_datetime(df['End Time'])

# Remove rows with missing values
df.dropna(inplace=True)

# Drop unnecessary columns
df.drop(['Title', 'Description', 'Location', 'Review'], axis=1, inplace=True)

# Convert the categorical variables to numerical variables
df['Status'] = pd.Categorical(df['Status'], categories=['requested', 'in-progress', 'completed', 'rejected'], ordered=True)
df['Status'] = df['Status'].cat.codes

# Convert the usernames to numerical IDs
df['Customer ID'] = df['Customer Username'].astype('category').cat.codes
df['Worker ID'] = df['Worker Username'].astype('category').cat.codes
df.drop(['Customer Username', 'Worker Username'], axis=1, inplace=True)

# Shuffle the rows of the data
shuffled_data = df.sample(frac=1).reset_index(drop=True)

# Split the data into training and testing sets
num_rows = shuffled_data.shape[0]
train_size = int(0.8 * num_rows)
train_data = shuffled_data.iloc[:train_size]
test_data = shuffled_data.iloc[train_size:]

# Define a function to calculate the Pearson correlation coefficient
def pearson_correlation(x, y):
    x = np.array(x)
    y = np.array(y)
    n = len(x)
    xy = x * y
    sum_x = sum(x)
    sum_y = sum(y)
    sum_xy = sum(xy)
    sum_x_sq = sum(x ** 2)
    sum_y_sq = sum(y ** 2)
    numerator = (n * sum_xy) - (sum_x * sum_y)
    denominator = np.sqrt((n * sum_x_sq - sum_x**2) * (n * sum_y_sq - sum_y**2))
    if denominator == 0:
        return 0
    else:
        return numerator / denominator

# Define a function to predict the rating for a worker based on a customer's ratings and the Pearson correlation coefficients
def predict_rating(worker_ratings, correlation_coefficients, customer_ratings):
    numerator = 0
    denominator = 0
    for i in range(len(worker_ratings)):
        if worker_ratings[i] != 0 and customer_ratings[i] != 0:
            numerator += correlation_coefficients[i] * (customer_ratings[i] - np.mean(customer_ratings))
            denominator += np.abs(correlation_coefficients[i])
    if denominator == 0:
        return np.mean(worker_ratings)
    elif np.sum(correlation_coefficients) == 0:
        return 0
    else:
        # Calculate the weights
        weights = [correlation_coefficients[i] for i in range(len(worker_ratings)) if worker_ratings[i] != 0 and customer_ratings[i] != 0]
        # Calculate the ratings
        ratings = [worker_ratings[i] for i in range(len(worker_ratings)) if worker_ratings[i] != 0 and customer_ratings[i] != 0]
        # Check if the weights are all zeros
        if np.sum(weights) == 0:
            # If all weights are zero, use unweighted mean
            predicted_rating = np.mean(ratings)
        else:
            # If some weights are non-zero, use weighted mean
            predicted_rating = np.average(ratings, weights=weights)
        return np.mean(worker_ratings) + (numerator / denominator)


# Define a function to calculate the mean absolute error (MAE) between the predicted ratings and the actual ratings
def calculate_mae(predictions, actual_ratings):
    n = len(predictions)
    if n == 0:
        return 0.0
    error = sum([abs(predictions[i] - actual_ratings[i]) for i in range(n)])
    return error / n


# Calculate the Pearson correlation coefficients between each pair of customers and workers
num_customers = len(df['Customer ID'].unique())
num_workers = len(df['Worker ID'].unique())
correlation_matrix = np.zeros((num_customers, num_workers))
for i in range(num_customers):
    for j in range(num_workers):
        customer_ratings = []
        worker_ratings = []
        for index, row in train_data.iterrows():
            if row['Customer ID'] == i and row['Worker ID'] == j:
                customer_ratings.append(row['Rating'])
                if row['Rating'] != 0:
                    worker_ratings.append(row['Rating'])
        if len(customer_ratings) == len(worker_ratings):
            correlation_matrix[i, j] = pearson_correlation(customer_ratings, worker_ratings)


# Make predictions on the test set using the correlation matrix
predicted_ratings = []
actual_ratings = []
for i in range(test_data.shape[0]):
    customer_id = test_data.iloc[i]['Customer ID']
    worker_id = test_data.iloc[i]['Worker ID']
    actual_rating = test_data.iloc[i]['Rating']
    # Find the workers with the highest correlations for this customer
    worker_correlations = correlation_matrix[customer_id]
    top_worker_ids = np.argsort(worker_correlations)[::-1][:5]

# Calculate the predicted rating as the weighted average of the top worker ratings
weights = np.array([worker_correlations[id] for id in top_worker_ids])
ratings = []
for id in top_worker_ids:
    worker_ratings = []
    for index, row in train_data.iterrows():
        if row['Customer ID'] == customer_id and row['Worker ID'] == id:
            worker_ratings.append(row['Rating'])
    if len(worker_ratings) > 0:
        ratings.append(sum(worker_ratings) / len(worker_ratings))
ratings = np.array(ratings)
if len(ratings) > 0:
    if np.sum(weights) == 0:
        predicted_rating = np.mean(ratings)
    else:
        predicted_rating = np.average(ratings, weights=weights)
else:
    predicted_rating = 0.0


# Calculate the accuracy of the predictions using the mean absolute error (MAE)
mae = calculate_mae(predicted_ratings, actual_ratings)
print("MAE:", mae)

#Save the correlation matrix to a file
np.savetxt('correlation_matrix.csv', correlation_matrix, delimiter=',')

TypeError: Axis must be specified when shapes of a and weights differ.

In [115]:
import pandas as pd
import numpy as np

# Load the dataset
data = pd.read_csv('householddata.csv')

# Pivot the dataset to get the ratings matrix
ratings_matrix = data.pivot_table(index='Customer ID', columns='Worker ID', values='Rating')

# Calculate the correlation matrix
correlation_matrix = ratings_matrix.corr(method='pearson', min_periods=5)
r = sum((x - mean(x)) * (y - mean(y))) / (sqrt(sum((x - mean(x)) ** 2)) * sqrt(sum((y - mean(y)) ** 2)))



KeyError: 'Customer ID'

In [117]:
import pandas as pd
import numpy as np

# Read the data from the CSV file
data = pd.read_csv('householddata.csv')

# Split the data into training and testing sets
train_data = data.sample(frac=0.7, random_state=1)
test_data = data.drop(train_data.index)

# Calculate the Pearson correlation coefficients between customers and workers in the training set
correlation_matrix = train_data.pivot_table(index='Customer Username', columns='Worker Username', values='Rating').corr()

# Make predictions on the test set using the correlation matrix
predicted_ratings = []
actual_ratings = []
for i in range(test_data.shape[0]):
    customer_id = test_data.iloc[i]['Customer Username']
    worker_id = test_data.iloc[i]['Worker Username']
    actual_rating = test_data.iloc[i]['Rating']
    # Find the workers with the highest correlations for this customer
    worker_correlations = correlation_matrix[customer_id]
    top_worker_ids = np.argsort(worker_correlations)[::-1][:5]

    # Calculate the predicted rating as the weighted average of the top worker ratings
    weights = [worker_correlations[id] for id in top_worker_ids]
    ratings = []
    for id in top_worker_ids:
        worker_ratings = []
        for index, row in train_data.iterrows():
            if row['Customer Username'] == customer_id and row['Worker Username'] == id:
                worker_ratings.append(row['Rating'])
        if len(worker_ratings) > 0:
            ratings.append(sum(worker_ratings) / len(worker_ratings))
    if len(ratings) > 0:
        if np.sum(weights) == 0:
            predicted_rating = np.mean(ratings)
        else:
            predicted_rating = np.average(ratings, weights=weights)
    else:
        predicted_rating = 0.0
    predicted_ratings.append(predicted_rating)
    actual_ratings.append(actual_rating)

# Calculate the accuracy of the predictions using the mean absolute error (MAE)
mae = np.mean(np.abs(np.array(predicted_ratings) - np.array(actual_ratings)))
print("MAE:", mae)

KeyError: 'jessica33'

In [127]:
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression

# Load the dataset
data = pd.read_csv('householddata.csv')

# Preprocess the data
data = data.dropna()
data = pd.get_dummies(data)

# Split the data
X = data.drop(['Rating'], axis=1)
y = data['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Calculate the Pearson correlation coefficient
correlations = []
for feature in X_train.columns:
    correlation, _ = pearsonr(X_train[feature], y_train)
    correlations.append((feature, correlation))
correlations.sort(key=lambda x: abs(x[1]), reverse=True)

# Select the features
num_features = 5
selected_features = [correlation[0] for correlation in correlations[:num_features]]

# Train the model
model = LinearRegression()
model.fit(X_train[selected_features], y_train)

# Evaluate the model
score = model.score(X_test[selected_features], y_test)
print(f'R-squared score: {score}')

# Make predictions
new_data = pd.read_csv('householddata.csv')
new_data = pd.get_dummies(new_data)
predictions = model.predict(new_data[selected_features])




R-squared score: -0.21572768883569737


In [128]:
import pickle

# Save the trained model
with open('pearson_correlation_model.pkl', 'wb') as file:
    pickle.dump(pearson_corr_model, file)


NameError: name 'pearson_corr_model' is not defined

In [129]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import pairwise_distances
import pickle

# Load the dataset
df = pd.read_csv("householddata.csv")

# Pivot the data to create a matrix of worker ratings
ratings_df = df.pivot_table(index="Worker Username", columns="Customer Username", values="Rating")

# Replace missing ratings with 0
ratings_df.fillna(0, inplace=True)

# Calculate the Pearson Correlation Coefficient
pearson_corr = 1 - pairwise_distances(ratings_df, metric="correlation")

# Save the Pearson Correlation model to a pickle file
with open("pearson_model.pkl", "wb") as f:
    pickle.dump(pearson_corr, f)


In [131]:
import csv
from math import sqrt

# Load the data from the CSV file
data = []
with open("householddata.csv", newline="") as file:
    reader = csv.reader(file)
    header = next(reader)
    for row in reader:
        data.append(row)

# Compute the average rating for each worker
worker_ratings = {}
for row in data:
    worker = row[1]
    rating = int(row[8]) if row[8] != '' else 0
    if worker not in worker_ratings:
        worker_ratings[worker] = []
    worker_ratings[worker].append(rating)
worker_averages = {}
for worker, ratings in worker_ratings.items():
    worker_averages[worker] = sum(ratings) / len(ratings)

# Compute the Pearson correlation coefficient between each pair of workers
correlations = {}
for i in range(len(worker_averages)):
    for j in range(i + 1, len(worker_averages)):
        worker1 = list(worker_averages.keys())[i]
        worker2 = list(worker_averages.keys())[j]
        ratings1 = worker_ratings[worker1]
        ratings2 = worker_ratings[worker2]
        avg1 = worker_averages[worker1]
        avg2 = worker_averages[worker2]
        numerator = 0
        denom1 = 0
        denom2 = 0
        for k in range(len(ratings1)):
            numerator += (ratings1[k] - avg1) * (ratings2[k] - avg2)
            denom1 += (ratings1[k] - avg1) ** 2
            denom2 += (ratings2[k] - avg2) ** 2
        denominator = sqrt(denom1) * sqrt(denom2)
        if denominator != 0:
            correlation = numerator / denominator
            correlations[(worker1, worker2)] = correlation

# Save the correlations to a file
with open("worker_correlations.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Worker 1", "Worker 2", "Correlation"])
    for (worker1, worker2), correlation in correlations.items():
        writer.writerow([worker1, worker2, correlation])
