In [15]:
from pyspark.sql import DataFrame
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame

In [16]:
spark = SparkSession.builder.appName("final_analysis").getOrCreate()

In [19]:
df = spark.read.parquet("../data/predicted_datav0.parquet")
df.show(5)

+--------------+----------+---------+------------------+------------------+--------+--------------------+------------+-----------+---------+-----------+
|jira_ticket_id|      date|completed|num_slack_messages|         num_hours|engineer|  ticket_description|  initiative|new_revenue|repo_name|lines_added|
+--------------+----------+---------+------------------+------------------+--------+--------------------+------------+-----------+---------+-----------+
|             1|2023-03-31|     true|             276.0| 62.13453674316406|    Dale|Taurus both absen...|  Efficiency|  5295.6553|        G|         42|
|            12|2023-12-30|     true|             436.0| 61.72333526611328|   Daisy|Toronto funereal ...|  Efficiency|  3425.6223|        Q|         73|
|            13|2023-03-24|    false|             294.0|  97.2278060913086| Unknown|sizzle animism ed...|  Efficiency|   2806.764|        R|         22|
|            22|2023-06-22|    false|             366.0| 45.48678970336914|    Jos

### Questions to be answered:
- Q1: What is the longest Jira ticket description?
- Q2: Which repo has the most lines of code added?
- Q3: Provide the maximum number of Slack messages in any ticket for each engineer
- Q4: Mean hours spent on a ticket in June 2023
- Q5: Total lines of code contributed by completed tickets to the repo 'A'
- Q6: Total new revenue per engineer per company initiative

In [26]:
def longest_ticket_description(df: DataFrame) -> str:
    description = df.orderBy(F.length("ticket_description").desc()).select("ticket_description").limit(1).collect()[0][0]
    return description

# Get the result
result_q1 = longest_ticket_description(df)
result_q1

'Carmela sibling musicology patron gunpoint Canfield mammal Santayana Freddie Waterhouse estuary eligible Todd bashaw repellent Jovanovich integrity windbreak halide pestilential italic desiccate Hanoverian Riordan Lathrop connotative ratify Chattanooga phenol enjoinder chase breakdown alkaloid homology cleric consistent pickle rather Barney dogma crocodile liaison endometrial embroider methodology within marketeer cope patrol paycheck Nevins Spokane theorem Jorgensen Wilma transoceanic Mansfield arboretum attribution chemistry Woodbury Cottrell prosodic lox fallacious tachyon coprocessor Furman putdown Pickford goose ignition icosahedral chemic reconnaissance aggravate marinade furthest converge Apocrypha formula cocky landlocked Hopkins stamp Bennington injudicious bulletin spontaneous whalebone prolific scavenge aliphatic balsam offprint shepherdess underling Banbury Rebecca flush'

In [21]:
def repo_with_most_lines(df: DataFrame) -> DataFrame:
    return df.groupBy("repo_name").agg(F.sum("lines_added").alias("total_lines")).orderBy(F.desc("total_lines")).limit(1).select("repo_name")

# Get the result
result_q2 = repo_with_most_lines(df)
result_q2.show()

+---------+
|repo_name|
+---------+
|        R|
+---------+



In [22]:
def max_slack_messages_per_engineer(df: DataFrame) -> DataFrame:
    return df.groupBy("engineer").agg(F.max("num_slack_messages").alias("max_messages")).orderBy("engineer")

# Get the result
result_q3 = max_slack_messages_per_engineer(df)
result_q3.show()

+--------+------------+
|engineer|max_messages|
+--------+------------+
|    Alex|       500.0|
|   Daisy|       500.0|
|    Dale|       500.0|
|    Josh|       500.0|
|  Sandra|       500.0|
| Unknown|       500.0|
|    alex|       500.0|
|   daisy|       500.0|
|    dale|       500.0|
|    josh|       500.0|
|  sandra|       500.0|
+--------+------------+



In [23]:
def mean_hours_june_2023(df: DataFrame) -> DataFrame:
    return df.filter((F.year("date") == 2023) & (F.month("date") == 6)).agg(F.mean("num_hours").alias("mean_hours"))

# Get the result
result_q4 = mean_hours_june_2023(df)
result_q4.show()

+-----------------+
|       mean_hours|
+-----------------+
|50.20965416646661|
+-----------------+



In [24]:
def total_lines_completed_repo_a(df: DataFrame) -> DataFrame:
    return df.filter((F.col("completed") == True) & (F.col("repo_name") == "A")).agg(F.sum("lines_added").alias("total_lines"))

# Get the result
result_q5 = total_lines_completed_repo_a(df)
result_q5.show()

+-----------+
|total_lines|
+-----------+
|      85208|
+-----------+



In [25]:
def total_revenue_per_engineer_initiative(df: DataFrame) -> DataFrame:
    return df.groupBy("engineer", "initiative").agg(F.sum("new_revenue").alias("total_revenue")).orderBy("engineer", "initiative")

# Get the result
result_q6 = total_revenue_per_engineer_initiative(df)
result_q6.show()

+--------+------------+--------------------+
|engineer|  initiative|       total_revenue|
+--------+------------+--------------------+
|    Alex|  Efficiency|2.6966255779155493E7|
|    Alex|New Customer| 2.647524867811036E7|
|    Alex|     Support|2.5910800018243313E7|
|   Daisy|  Efficiency| 2.681017515390265E7|
|   Daisy|New Customer|2.6610312955403924E7|
|   Daisy|     Support|2.6257335872519135E7|
|    Dale|  Efficiency|  2.68572271843884E7|
|    Dale|New Customer|2.6060978011313647E7|
|    Dale|     Support|2.6448138232634544E7|
|    Josh|  Efficiency| 2.611526335034299E7|
|    Josh|New Customer|2.6681857968283057E7|
|    Josh|     Support|2.6748859962927252E7|
|  Sandra|  Efficiency|2.5834171813203394E7|
|  Sandra|New Customer|2.7332515504475117E7|
|  Sandra|     Support|2.6311226631829336E7|
| Unknown|  Efficiency|2.7215193001622915E7|
| Unknown|New Customer|2.6835177124576285E7|
| Unknown|     Support| 2.723762447342052E7|
|    alex|  Efficiency|  1341800.4322223663|
|    alex|