In [None]:
import pandas as pd
from faker import Faker
import random

# Shared Faker instance; generate_random_data() below draws emails, timestamps,
# product names, customer names, cities and zip codes from it.
fake = Faker()
# Load the existing order records from disk; the synthetic rows generated below
# are appended to this frame.
df = pd.read_csv('fake_data.csv')

def generate_random_data(num_records):
    """Build a DataFrame of synthetic order records.

    Each record gets an "LK"-prefixed 7-digit order id, a Faker-generated
    email, name, city and zip code, a random 10-character alphanumeric SKU,
    a random 10-digit phone number, a quantity of 1, and a sales amount that
    is either 0 or a random integer in [100, 1000].

    Parameters
    ----------
    num_records : int
        Number of rows to generate.

    Returns
    -------
    pandas.DataFrame
        One row per generated record, with the columns "Order id", "Email",
        "Sales", "Date", "Product quantity", "Product name", "Product sku",
        "Customer Name", "Customer City", "Customer Zip", "Customer Phone".
    """
    # Local import keeps this function self-contained; the notebook's import
    # cell does not bring in datetime.
    from datetime import timezone

    random_data = []
    for _ in range(num_records):
        order_id = "LK" + ''.join(random.choices('0123456789', k=7))
        email = fake.email()
        sales = random.choice([0, random.randint(100, 1000)])
        # date_time_this_year() returns a naive datetime by default, and "%z"
        # renders as an empty string for naive values, which used to leave a
        # dangling trailing space in every "Date" cell. Labelling the value as
        # UTC makes the offset explicit ("+0000").
        date = (
            fake.date_time_this_year()
            .replace(tzinfo=timezone.utc)
            .strftime("%Y-%m-%d %H:%M:%S %z")
        )
        product_quantity = 1
        product_name = fake.sentence(nb_words=6)  # Random product name
        product_sku = ''.join(random.choices('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ', k=10))
        customer_name = fake.name()
        customer_city = fake.city()
        customer_zip = fake.zipcode()
        customer_phone = ''.join(random.choices('0123456789', k=10))

        random_data.append({
            "Order id": order_id,
            "Email": email,
            "Sales": sales,
            "Date": date,
            "Product quantity": product_quantity,
            "Product name": product_name,
            "Product sku": product_sku,
            "Customer Name": customer_name,
            "Customer City": customer_city,
            "Customer Zip": customer_zip,
            "Customer Phone": customer_phone
        })

    return pd.DataFrame(random_data)

# Generate 100,000 random records (the count is the argument passed below)
random_df = generate_random_data(100000)

# Append the random rows after the rows loaded from fake_data.csv;
# ignore_index=True renumbers the combined frame with a fresh 0..n-1 index
combined_df = pd.concat([df, random_df], ignore_index=True)

# Save to CSV without writing the index column
combined_df.to_csv("combined_data.csv", index=False)

# Show the combined frame
print(combined_df)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Read the order records from disk
df = pd.read_csv('fake_data.csv')

# Parse the order timestamps and index the frame by them
df = df.assign(Date=pd.to_datetime(df['Date'])).set_index('Date')

# Calendar features derived from the Date index
df = df.assign(Month=df.index.month, Year=df.index.year)

# Model inputs (month and year of the order) and the target (order sales)
X = df.loc[:, ['Month', 'Year']]
y = df.loc[:, 'Sales']

# Split the data into training and test sets.
# Row positions are split together with X and y: the Date index is not
# guaranteed to be unique, so positions are the only unambiguous way to map
# the test-set predictions back onto rows of df further down. Passing an extra
# array does not change which rows land in the train/test splits.
row_positions = np.arange(len(X))
X_train, X_test, y_train, y_test, train_positions, test_positions = train_test_split(
    X, y, row_positions, test_size=0.2, random_state=0
)

# Function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X_train, y_train : training features and target
    X_test, y_test : held-out features and target

    Returns
    -------
    tuple
        ``(mse, r2, y_pred)``: mean squared error and R-squared of the
        predictions on the test split, and the predictions themselves.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mse, r2, y_pred

# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=0),
    'Random Forest': RandomForestRegressor(random_state=0),
    'XGBoost': xgb.XGBRegressor(objective='reg:squarederror', random_state=0)
}

# Evaluate each model
results = {}
for name, model in models.items():
    mse, r2, y_pred = evaluate_model(model, X_train, X_test, y_train, y_test)
    results[name] = {'MSE': mse, 'R-squared': r2, 'Predictions': y_pred}

# Print results
for name, result in results.items():
    print(f"{name} - MSE: {result['MSE']:.4f}, R-squared: {result['R-squared']:.4f}")

# Choose the best model based on R-squared
best_model_name = max(results, key=lambda k: results[k]['R-squared'])
best_model = models[best_model_name]

# Compare actual and predicted sales
y_pred_best = results[best_model_name]['Predictions']
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_best})
print(comparison_df)

# Attach the test-set predictions to df by row position. Joining on the Date
# index would multiply rows whenever a timestamp occurs more than once and
# would also hand test predictions to unrelated rows sharing that timestamp.
predicted_df = pd.DataFrame(y_pred_best, index=X_test.index, columns=['Predicted_Sales'])
predicted_column = np.full(len(df), np.nan)
predicted_column[test_positions] = y_pred_best
df = df.assign(Predicted_Sales=predicted_column)

# Predict next month's sales based on the last billing date for each customer
customer_keys = ['Email', 'Customer Name']
# Rows without a parsed date cannot be a "last" order, so they are left out.
orders = df.reset_index().dropna(subset=['Date'])
last_billing_dates = orders.groupby(customer_keys)['Date'].max()

# Pick each customer's own most recent order row. Looking the sales value up by
# timestamp alone could return another customer's order placed at the same time.
last_order_rows = orders.groupby(customer_keys)['Date'].idxmax()
last_orders = orders.loc[last_order_rows, customer_keys + ['Date', 'Sales']].reset_index(drop=True)

# Month after the last billing date, rolling December over into January of the
# following year.
last_month = last_orders['Date'].dt.month
next_month = last_month % 12 + 1
next_year = last_orders['Date'].dt.year + (last_month == 12).astype(int)

# Same column names and order as the training features, so the fitted model
# receives the layout it was trained on; all customers are predicted in one call.
next_month_features = pd.DataFrame({'Month': next_month, 'Year': next_year})
if len(next_month_features):
    predicted_sales = best_model.predict(next_month_features)
else:
    # No customers: predict() rejects zero-row input, so produce an empty result.
    predicted_sales = np.array([], dtype=float)

predictions_df = last_orders.rename(
    columns={'Date': 'Last_Billing_Date', 'Sales': 'Last_Sales'}
).assign(Next_Month=next_month, Next_Year=next_year, Predicted_Sales=predicted_sales)
predictions_df = predictions_df[['Email', 'Customer Name', 'Last_Billing_Date', 'Last_Sales', 'Next_Month', 'Next_Year', 'Predicted_Sales']]

# Plain list of row tuples, kept for code that still reads `predictions`
predictions = list(predictions_df.itertuples(index=False, name=None))

# Save the predictions to a CSV file
predictions_df.to_csv('customer_next_month_predictions.csv', index=False)

print(f"Best model: {best_model_name}")
print(predictions_df.head())


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Set the style for seaborn plots
sns.set(style='whitegrid')

# 1. Product Popularity Analysis
# Bar chart of the ten product names that occur most often in the orders.
plt.figure(figsize=(10, 10))
top_products = df['Product name'].value_counts().head(10)
sns.barplot(x=top_products.index, y=top_products.values, palette='viridis')
plt.title('Top 10 Most Popular Products', fontsize=16)
plt.xlabel('Product Name', fontsize=14)
plt.ylabel('Number of Orders', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# 2. Customer Segmentation by Location
# Bar chart of the ten cities with the most orders.
plt.figure(figsize=(10, 6))
top_cities = df['Customer City'].value_counts().head(10)
sns.barplot(x=top_cities.index, y=top_cities.values, palette='plasma')
plt.title('Top 10 Cities by Number of Orders', fontsize=16)
plt.xlabel('City', fontsize=14)
plt.ylabel('Number of Orders', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()



# 4. Price Point Analysis
# Non-numeric sales values become NaN and fall outside every bucket.
df['Sales'] = pd.to_numeric(df['Sales'], errors='coerce')
# include_lowest=True closes the first bucket on the left, so orders with
# Sales == 0 are counted under '0-1000' instead of being silently dropped.
price_ranges = pd.cut(df['Sales'], bins=[0, 1000, 2000, 3000, 4000, float('inf')],
                      labels=['0-1000', '1001-2000', '2001-3000', '3001-4000', '4000+'],
                      include_lowest=True)
price_range_counts = price_ranges.value_counts()
plt.figure(figsize=(10, 6))
sns.barplot(x=price_range_counts.index, y=price_range_counts.values, palette='crest')
plt.title('Distribution of Sales by Price Range', fontsize=16)
plt.xlabel('Price Range', fontsize=14)
plt.ylabel('Number of Orders', fontsize=14)
plt.show()

# 5. Time-based Analysis
# NOTE(review): left disabled. If df is the frame prepared in the modelling
# cell, 'Date' is the index there rather than a column, so df['Date'] would
# raise KeyError -- confirm before re-enabling.
# df['Date'] = pd.to_datetime(df['Date'])
# orders_by_hour = df['Date'].dt.hour.value_counts().sort_index()
# plt.figure(figsize=(14, 7))
# sns.lineplot(x=orders_by_hour.index, y=orders_by_hour.values, marker='o', color='royalblue')
# plt.title('Number of Orders by Hour of Day', fontsize=16)
# plt.xlabel('Hour of Day', fontsize=14)
# plt.ylabel('Number of Orders', fontsize=14)
# plt.grid(True)
# plt.show()

# 6. Color Preference Analysis
def extract_color(product_name):
    """Return the first known color mentioned in a product name.

    Matching is case-insensitive and on whole words only, so a color hidden
    inside a longer word is ignored ("Embroidered" does not count as "Red").
    When several known colors appear, the one listed first in ``colors`` wins.

    Parameters
    ----------
    product_name : str
        Product name to inspect. Any non-string value (for example the NaN
        pandas uses for a missing name) is treated as having no color.

    Returns
    -------
    str
        The matching entry of ``colors`` with its original capitalisation,
        or 'Other' when no known color is present.
    """
    # Local import keeps the function self-contained; the notebook's import
    # cell does not bring in re.
    import re

    colors = ['Blue', 'Red', 'Pink', 'Green', 'Grey', 'White', 'Ivory', 'Navy', 'Rose', 'Lavender', 'Mustard']
    if not isinstance(product_name, str):
        return 'Other'

    # Split the name into lowercase alphabetic words once, then test membership.
    words = set(re.findall(r'[a-z]+', product_name.lower()))
    for color in colors:
        if color.lower() in words:
            return color
    return 'Other'

# Tag every order with the color found in its product name and chart the counts
df['Color'] = df['Product name'].apply(extract_color)
color_counts = df['Color'].value_counts()
plt.figure(figsize=(10, 6))
sns.barplot(x=color_counts.index, y=color_counts.values, palette='pastel')
plt.title('Color Preferences in Products', fontsize=16)
plt.xlabel('Color', fontsize=14)
plt.ylabel('Number of Orders', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.show()


# 3. Size Analysis
# Sizes are matched as standalone tokens only (S, M, L, XS, XL, XXL, 2X, 3XL, ...).
# An unanchored character class would capture the first capital X/L/S anywhere
# in the name, e.g. the "S" of "Set".
size_pattern = r'\b(\d+XL?|X{0,3}[SL]|M)\b'
# expand=False yields a Series, so the pie labels are plain size strings rather
# than one-element tuples.
size_counts = df['Product name'].str.extract(size_pattern, expand=False).value_counts()
if size_counts.empty:
    # Nothing to draw when no product name carries a size token.
    print("No size tokens found in product names; skipping size chart.")
else:
    plt.figure(figsize=(8, 6))
    size_counts.plot(kind='pie', autopct='%1.1f%%', colors=sns.color_palette('Set2'))
    plt.title('Distribution of Product Sizes', fontsize=12)
    plt.ylabel('')
    plt.show()



# 7. Customer Value Analysis
# Ten customers with the highest summed sales.
plt.figure(figsize=(10, 6))
customer_value = df.groupby('Customer Name')['Sales'].sum().sort_values(ascending=False).head(10)
sns.barplot(x=customer_value.index, y=customer_value.values, palette='coolwarm')
plt.title('Top 10 Customers by Total Sales Value', fontsize=16)
plt.xlabel('Customer Name', fontsize=14)
plt.ylabel('Total Sales Value', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Print some summary statistics
print("Total number of orders:", len(df))
print("Total sales value:", df['Sales'].sum())
print("Average order value:", df['Sales'].mean())
print("Number of unique products:", df['Product name'].nunique())
print("Number of unique customers:", df['Customer Name'].nunique())

In [None]:
import matplotlib.pyplot as plt

# Scatter the best model's test-set predictions against the observed sales
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(y_test, best_model.predict(X_test), alpha=0.5)
ax.set_xlabel('Actual Sales')
ax.set_ylabel('Predicted Sales')
ax.set_title('Actual vs Predicted Sales')
ax.grid(True)
plt.show()
