### bb

In [None]:
!pip install sweetviz

Collecting sweetviz
  Downloading sweetviz-2.3.1-py3-none-any.whl.metadata (24 kB)
Downloading sweetviz-2.3.1-py3-none-any.whl (15.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.1/15.1 MB[0m [31m79.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sweetviz
Successfully installed sweetviz-2.3.1


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import binom, norm, chi2_contingency
import sweetviz as sv
from scipy.stats import ttest_ind

# Read in data from CSV file
file_path = "C:\\Users\\grays\\OneDrive\\Trip Report_Fall Project Cleaned.csv"
data = pd.read_csv(file_path, encoding='latin1')  # Ensure proper encoding

# Data Preview and Column Descriptions
print("Data Preview:")
print(data.head())

print("\nColumn Descriptions:")
column_descriptions = {
    "DATE": "Date of the service request",
    "DAY": "Day of the week",
    "MONTH": "Month of the year",
    "YEAR": "Year of the service request",
    "DISTRICT": "Hauling District where the service is performed",
    "MAS": "Master Account Number",
    "CUSTOMER": "Customer Name",
    "TICKET": "Ticket number for the service request",
    "LODTYP": "Highlights TRIPPED haul 'TRP'",
    "SVCHRG": "Service charge for the request",
    "REQUESTED": "Dispatcher Name who opened the ticket",
    "ENTRY": "Dispatcher Name who assigned the ticket",
    "CLOSED": "Dispatcher Name who closed the ticket",
    "DRIVER": "Driver ID assigned to the service request",
    "AM": "Account Manager",
}
for column, description in column_descriptions.items():
    print(f"{column}: {description}")

# Data Profiling
# List comprehension to square numbers
squared_numbers = [x**2 for x in range(10)]
print("Squared numbers:", squared_numbers)

# Lambda function to add two numbers
add = lambda x, y: x + y
print("Sum of 5 and 3:", add(5, 3))

# Data Quality
# Remove duplicate rows
data_no_duplicates = data.drop_duplicates()
print("Data without duplicates:\n", data_no_duplicates.head())

# Fill missing values using forward fill
data_filled = data.fillna(method='ffill')
print("Data with missing values filled:\n", data_filled.head())

# Traditional Data Profiling
# Print data types for each column
print("Column data types:\n", data.dtypes)

# Calculate number of unique values for each column
unique_values = {col: data[col].nunique() for col in data.columns}
print("Unique values per column:\n", unique_values)

# Additional EDA
# Generate EDA report using Sweetviz
sweet_report = sv.analyze(data)
sweet_report.show_html('EDA_Report.html')

# Fill missing values
data_filled = data.ffill()

# Exclude non-numeric columns for correlation calculation
numeric_data = data_filled.select_dtypes(include=[np.number])

# Calculate correlation matrix
correlation_matrix = numeric_data.corr()
print(correlation_matrix)

# Generate pairplot
sns.pairplot(data)
plt.show()

# Distributions
# Generate random data from normal distribution
data_sample = norm.rvs(5, 2, 1000)
plt.hist(data_sample)
plt.show()

# Fit normal distribution to data
mu, std = norm.fit(data_sample)
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, std)
plt.plot(x, p, 'k', linewidth=2)
plt.title("Fit Results: mu = %.2f, std = %.2f" % (mu, std))

# PMF AND CDF
# Define binomial distribution parameters
n, p = 10, 0.5
x = np.arange(n+1)
pmf = binom.pmf(x, n, p)
plt.bar(x, pmf)
plt.show()

cdf = binom.cdf(x, n, p)
plt.plot(x, cdf, marker='o', linestyle='--')
plt.show()

# PDF
# Define normal distribution parameters
mu, sigma = 0, 1
x = np.linspace(mu - 3*sigma, mu + 3*sigma, 100)
pdf = norm.pdf(x, mu, sigma)
plt.plot(x, pdf)
plt.show()

# Chi-Squared Test
top_n = 5
top_drivers = data['DRIVER'].value_counts().head(top_n).index.tolist()
top_customers = data['CUSTOMER'].value_counts().head(top_n).index.tolist()
filtered_data = data[data['DRIVER'].isin(top_drivers) & data['CUSTOMER'].isin(top_customers)]
contingency_table = pd.crosstab(filtered_data['DRIVER'], filtered_data['CUSTOMER'])
chi2, p, _, _ = chi2_contingency(contingency_table)
print(f"Chi-Squared Value: {chi2}")
print(f"P-Value: {p}")
if p < 0.05:
 print("Reject the null hypothesis. There is a significant association between top drivers and top customers.")
else:
 print("Fail to reject the null hypothesis. No significant association observed between top drivers and top customers.")

import matplotlib.pyplot as plt

# Sales Reps and Customers Analysis
top_n_rep = 5
top_n_customers = 20
top_reps = data['AM'].value_counts().head(top_n_rep).index.tolist()
top_customers = data['CUSTOMER'].value_counts().head(top_n_customers).index.tolist()
rep_customer_counts = data[data['AM'].isin(top_reps)].groupby('AM')['CUSTOMER'].nunique()
rep_top_customer_counts = data[(data['AM'].isin(top_reps)) & (data['CUSTOMER'].isin(top_customers))].groupby('AM')['CUSTOMER'].nunique()
summary = pd.DataFrame({ 'Total_Unique_Customers': rep_customer_counts, 'Unique_Top_20_Customers': rep_top_customer_counts })
print(summary)

for rep in top_reps:
    sub_data = data[data['AM'] == rep]
    print(f"Statistics for {rep}:")
    print("Mean:", sub_data['CUSTOMER'].value_counts().mean())
    mode_value = sub_data['CUSTOMER'].value_counts().idxmax()
    mode_count = sub_data['CUSTOMER'].value_counts().max()
    print("Mode:", mode_value, "with count of", mode_count)
    print("Spread (Standard Deviation):", sub_data['CUSTOMER'].value_counts().std())
    print("Tails (Kurtosis):", sub_data['CUSTOMER'].value_counts().kurtosis())
    plt.hist(sub_data['CUSTOMER'].value_counts(), bins=10, alpha=0.7, label=rep)
    plt.xlabel("Number of Occurrences")
    plt.ylabel("Number of Customers")
    plt.title("Distribution of Customer Occurrences by Top Sales Reps")
    plt.legend()
    plt.show()


# Save the EDA report to an HTML file
sweet_report = sv.analyze(data)
sweet_report.show_html('EDA_Report.html')

### Other

In [None]:
import pandas as pd
file_path = 'w1_data.csv'
data = pd.read_csv(file_path)

data.head()

Unnamed: 0,Шал,Тагт,Ашиглалтанд орсон он,Гараж,Цонх,Барилгын давхар,Хаалга,Талбай,Хэдэн давхарт,Лизинг,...,Байршил,Зарын дугаар:,Огноо,Зарын гарчиг,Үнэ,Байрлал,Хэрэглэгчийн нэр,Хэрэглэгчийн дугаар,Үзсэн,Зарын тайлбар
0,Паркет,Тагтгүй,2020,Байхгүй,Вакум,16,Бүргэд,24.04 м²,4,Лизинггүй,...,16-р хороолол,4580385,Нийтэлсэн: 2020-12-15 09:55,Бзд цайз захын баруун талд шинэ 1 өрөө байр,43000000.0,Улаанбаатар,,,639,1
1,Паркет,1 тагттай,2009,Байхгүй,Вакум,17,Бүргэд,54.0 м²,8,Банкны лизингтэй,...,13-р хороолол,4580104,Нийтэлсэн: 2020-12-15 08:08,Наран туул захын хажууд 1 өрөө,76000000.0,Улаанбаатар,,,202,8
2,Паркет,Тагтгүй,2009,Байхгүй,Вакум,5,Бүргэд,38.6 м²,5,Лизинггүй,...,Viva city,4608457,Нийтэлсэн: 2021-01-02 10:40,Vivad mansarttai 1 uruu 38.6m2,66000000.0,Улаанбаатар,,,397,1
3,Паркет,1 тагттай,2018,Байхгүй,Вакум,6,Бүргэд,31.0 м²,1,Лизинггүй,...,Найрамдал,4577481,Нийтэлсэн: 2021-01-02 13:01,Найрамдал зуслан дотор 1 өрөө байр,72000000.0,Улаанбаатар,,,165,1
4,Паркет,1 тагттай,2015,Байхгүй,Вакум,8,Бүргэд,38.0 м²,8,Лизинггүй,...,16-р хороолол,4586874,Нийтэлсэн: 2020-12-18 10:31,1 өрөө байр,58000000.0,Улаанбаатар,,,278,1


In [None]:
# Define a translation dictionary for the columns
translation_dict = {
    "Шал": "Floor Type",
    "Тагт": "Balcony",
    "Ашиглалтанд орсон он": "Year Built",
    "Гараж": "Garage",
    "Цонх": "Window",
    "Барилгын давхар": "Building Floors",
    "Хаалга": "Door",
    "Талбай": "Area",
    "Хэдэн давхарт": "Floor Number",
    "Лизинг": "Mortgage",
    "Байршил": "Location",
    "Зарын дугаар:": "Ad Number",
    "Огноо": "Date",
    "Зарын гарчиг": "Ad Title",
    "Үнэ": "Price",
    "Байрлал": "City",
    "Хэрэглэгчийн нэр": "User Name",
    "Хэрэглэгчийн дугаар": "User Number",
    "Үзсэн": "Views",
    "Зарын тайлбар": "Ad Description"
}

# Translate column names
translated_data = data.rename(columns=translation_dict)

# Save the translated data to a new CSV file
translated_file_path = 'translated_data.csv'
translated_data.to_csv(translated_file_path, index=False)

translated_file_path


'translated_data.csv'

In [None]:
data_cldataaaaaaaaaaaaaaadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27861 entries, 0 to 27860
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Шал                   27861 non-null  object
 1   Тагт                  27861 non-null  object
 2   Ашиглалтанд орсон он  27861 non-null  object
 3   Гараж                 27861 non-null  object
 4   Цонх                  27861 non-null  object
 5   Барилгын давхар       27861 non-null  object
 6   Хаалга                27861 non-null  object
 7   Талбай                27861 non-null  object
 8   Хэдэн давхарт         27861 non-null  object
 9   Лизинг                27861 non-null  object
 10  Дүүрэг                27861 non-null  object
 11  Цонхны тоо            27861 non-null  object
 12  Байршил               27861 non-null  object
 13  Зарын дугаар:         27861 non-null  object
 14  Огноо                 27861 non-null  object
 15  Зарын гарчиг          27861 non-null

In [None]:
# Cleaning the dataset: Remove commas and convert numeric columns to integers
data_cleaned = data.copy()

# Convert year columns to numeric after removing commas
for col in data_cleaned.columns[1:]:
    data_cleaned[col] = data_cleaned[col].str.replace(',', '').astype(float)

# Check for missing values
missing_values = data_cleaned.isnull().sum()

# Display cleaned dataset and missing values summary
data_cleaned.info(), missing_values


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 35 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Аймаг   27 non-null     object 
 1   1990    26 non-null     float64
 2   1991    27 non-null     float64
 3   1992    27 non-null     float64
 4   1993    27 non-null     float64
 5   1994    27 non-null     float64
 6   1995    27 non-null     float64
 7   1996    27 non-null     float64
 8   1997    27 non-null     float64
 9   1998    27 non-null     float64
 10  1999    27 non-null     float64
 11  2000    27 non-null     float64
 12  2001    27 non-null     float64
 13  2002    27 non-null     float64
 14  2003    27 non-null     float64
 15  2004    27 non-null     float64
 16  2005    27 non-null     float64
 17  2006    27 non-null     float64
 18  2007    27 non-null     float64
 19  2008    27 non-null     float64
 20  2009    27 non-null     float64
 21  2010    27 non-null     float64
 22  2011

(None,
 Аймаг    1
 1990     2
 1991     1
 1992     1
 1993     1
 1994     1
 1995     1
 1996     1
 1997     1
 1998     1
 1999     1
 2000     1
 2001     1
 2002     1
 2003     1
 2004     1
 2005     1
 2006     1
 2007     1
 2008     1
 2009     1
 2010     1
 2011     1
 2012     1
 2013     1
 2014     1
 2015     1
 2016     1
 2017     1
 2018     1
 2019     1
 2020     1
 2021     1
 2022     1
 2023     1
 dtype: int64)

In [None]:
# Handling missing values: Use forward fill for simplicity
data_cleaned.fillna(method='ffill', inplace=True)

# Verify there are no missing values left
missing_values_after_cleaning = data_cleaned.isnull().sum()

# Generate descriptive statistics for the numerical data
descriptive_stats = data_cleaned.describe()

missing_values_after_cleaning, descriptive_stats


  data_cleaned.fillna(method='ffill', inplace=True)


(Аймаг    0
 1990     0
 1991     0
 1992     0
 1993     0
 1994     0
 1995     0
 1996     0
 1997     0
 1998     0
 1999     0
 2000     0
 2001     0
 2002     0
 2003     0
 2004     0
 2005     0
 2006     0
 2007     0
 2008     0
 2009     0
 2010     0
 2011     0
 2012     0
 2013     0
 2014     0
 2015     0
 2016     0
 2017     0
 2018     0
 2019     0
 2020     0
 2021     0
 2022     0
 2023     0
 dtype: int64,
                1990          1991          1992          1993          1994  \
 count     28.000000     28.000000     28.000000     28.000000     28.000000   
 mean    2222.892857   2639.357143   2976.214286   3318.535714   3427.607143   
 std     3656.795595   4741.558653   5388.214398   5998.174485   6215.660939   
 min       82.000000     77.000000     78.000000    148.000000    141.000000   
 25%      625.250000    644.750000    696.500000    903.250000    717.000000   
 50%      966.000000   1103.500000   1113.500000   1217.000000   1313.000000   
 75% 

In [None]:
import matplotlib.pyplot as plt

# Plot the overall trend for "Country Sum"
country_sum_data = data_cleaned[data_cleaned["Aimag"] == "Country Sum"].iloc[0, 1:]

plt.figure(figsize=(12, 6))
plt.plot(country_sum_data.index, country_sum_data.values, marker='o', label='Country Sum')
plt.title("Trend Over Time for Country Sum")
plt.xlabel("Year")
plt.ylabel("Value")
plt.xticks(rotation=45)
plt.grid(True)
plt.legend()
plt.show()

# Plot trends for selected regions
selected_regions = ["Western region", "Bayan-Ulgii", "Zavkhan"]
plt.figure(figsize=(12, 6))

for region in selected_regions:
    region_data = data_cleaned[data_cleaned["Aimag"] == region].iloc[0, 1:]
    plt.plot(region_data.index, region_data.values, marker='o', label=region)

plt.title("Trends Over Time for Selected Regions")
plt.xlabel("Year")
plt.ylabel("Value")
plt.xticks(rotation=45)
plt.grid(True)
plt.legend()
plt.show()


In [None]:
# Calculate growth rate for each region
growth_rates = {}

for idx, row in data_cleaned.iterrows():
    region = row["Aimag"]
    values = row.iloc[1:].values  # Exclude the 'Aimag' column
    growth_rate = (values[-1] - values[0]) / values[0] if values[0] != 0 else None
    growth_rates[region] = growth_rate

# Sort regions by growth rate in descending order
sorted_growth = sorted(growth_rates.items(), key=lambda x: x[1], reverse=True)

# Convert to DataFrame for better readability
growth_df = pd.DataFrame(sorted_growth, columns=["Region", "Growth Rate"]).dropna()

# Instead of using the unavailable 'ace_tools', display the DataFrame using pandas' display function
# This provides a similar functionality of displaying the DataFrame to the user
display(growth_df.head(10)) # Displaying the top 10 regions for better readability
# growth_df.head() # You can still use this to display the top 5 rows if preferred

# You can also print the DataFrame to the console:
print(growth_df)

In [None]:
# Select the top 5 regions with the highest growth trends for visualization
top_growth_regions = growth_df["Region"].head(5).values

# Plot growth trends for the top regions
plt.figure(figsize=(12, 6))
for region in top_growth_regions:
    region_data = data_cleaned[data_cleaned["Aimag"] == region].iloc[0, 1:]
    plt.plot(region_data.index, region_data.values, marker='o', label=region)

plt.title("Growth Trends for Regions with Highest Growth")
plt.xlabel("Year")
plt.ylabel("Value")
plt.xticks(rotation=45)
plt.grid(True)
plt.legend()
plt.show()


In [None]:
# Extract "Country Sum" data
country_sum = data_cleaned[data_cleaned["Aimag"] == "Country Sum"].iloc[0, 1:]

# Calculate regional contributions as a percentage of "Country Sum"
regional_contributions = data_cleaned.drop(index=data_cleaned[data_cleaned["Aimag"] == "Country Sum"].index)
percentage_contributions = regional_contributions.copy()

for col in country_sum.index:
    percentage_contributions[col] = (regional_contributions[col] / country_sum[col]) * 100

# Plot contributions for selected regions
selected_regions = ["Western region", "Bayan-Ulgii", "Ulaanbaatar", "Darkhan-Uul"]
plt.figure(figsize=(14, 7))

for region in selected_regions:
    region_data = percentage_contributions[percentage_contributions["Aimag"] == region].iloc[0, 1:]
    plt.plot(region_data.index, region_data.values, marker='o', label=region)

plt.title("Regional Contributions to 'Country Sum' Over Time")
plt.xlabel("Year")
plt.ylabel("Percentage Contribution (%)")
plt.xticks(rotation=45)
plt.grid(True)
plt.legend()
plt.show()


### ml

In [31]:
import pandas as pd

# Load the uploaded CSV file to inspect its contents
file_path = 'w1_data.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
data.head()


Unnamed: 0,Шал,Тагт,Ашиглалтанд орсон он,Гараж,Цонх,Барилгын давхар,Хаалга,Талбай,Хэдэн давхарт,Лизинг,...,Байршил,Зарын дугаар:,Огноо,Зарын гарчиг,Үнэ,Байрлал,Хэрэглэгчийн нэр,Хэрэглэгчийн дугаар,Үзсэн,Зарын тайлбар
0,Паркет,Тагтгүй,2020,Байхгүй,Вакум,16,Бүргэд,24.04 м²,4,Лизинггүй,...,16-р хороолол,4580385,Нийтэлсэн: 2020-12-15 09:55,Бзд цайз захын баруун талд шинэ 1 өрөө байр,43000000.0,Улаанбаатар,,,639,1
1,Паркет,1 тагттай,2009,Байхгүй,Вакум,17,Бүргэд,54.0 м²,8,Банкны лизингтэй,...,13-р хороолол,4580104,Нийтэлсэн: 2020-12-15 08:08,Наран туул захын хажууд 1 өрөө,76000000.0,Улаанбаатар,,,202,8
2,Паркет,Тагтгүй,2009,Байхгүй,Вакум,5,Бүргэд,38.6 м²,5,Лизинггүй,...,Viva city,4608457,Нийтэлсэн: 2021-01-02 10:40,Vivad mansarttai 1 uruu 38.6m2,66000000.0,Улаанбаатар,,,397,1
3,Паркет,1 тагттай,2018,Байхгүй,Вакум,6,Бүргэд,31.0 м²,1,Лизинггүй,...,Найрамдал,4577481,Нийтэлсэн: 2021-01-02 13:01,Найрамдал зуслан дотор 1 өрөө байр,72000000.0,Улаанбаатар,,,165,1
4,Паркет,1 тагттай,2015,Байхгүй,Вакум,8,Бүргэд,38.0 м²,8,Лизинггүй,...,16-р хороолол,4586874,Нийтэлсэн: 2020-12-18 10:31,1 өрөө байр,58000000.0,Улаанбаатар,,,278,1


In [29]:
# Summary statistics of the dataset
summary = data.describe(include='all')

# Removing duplicate rows
data_cleaned = data.drop_duplicates()

# Display the summary and the number of rows after cleaning
summary, data_cleaned.shape


(           Шал       Тагт Ашиглалтанд орсон он    Гараж   Цонх  \
 count    27861      27861                27861    27861  27861   
 unique       7          5                   43        3      5   
 top     Паркет  1 тагттай                 2020  Байхгүй  Вакум   
 freq     26082      22507                 3431    25991  26596   
 
        Барилгын давхар  Хаалга   Талбай Хэдэн давхарт     Лизинг  ... Байршил  \
 count            27861   27861    27861         27861      27861  ...   27861   
 unique              25       6     1130            22          4  ...      75   
 top                 12  Бүргэд  30.0 м²             1  Лизинггүй  ...   Бусад   
 freq              5754   23328     1848          3814      23360  ...    3656   
 
        Зарын дугаар:                   Огноо Зарын гарчиг         Үнэ  \
 count           27861                  27861        27861       27861   
 unique           6482                   7292         5552         772   
 top           1404896  Нийтэ

In [30]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27861 entries, 0 to 27860
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Шал                   27861 non-null  object
 1   Тагт                  27861 non-null  object
 2   Ашиглалтанд орсон он  27861 non-null  object
 3   Гараж                 27861 non-null  object
 4   Цонх                  27861 non-null  object
 5   Барилгын давхар       27861 non-null  object
 6   Хаалга                27861 non-null  object
 7   Талбай                27861 non-null  object
 8   Хэдэн давхарт         27861 non-null  object
 9   Лизинг                27861 non-null  object
 10  Дүүрэг                27861 non-null  object
 11  Цонхны тоо            27861 non-null  object
 12  Байршил               27861 non-null  object
 13  Зарын дугаар:         27861 non-null  object
 14  Огноо                 27861 non-null  object
 15  Зарын гарчиг          27861 non-null