In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

# file_path to point to your data within your mounted Drive
file_path = "/content/drive/MyDrive/Portfolio/Data_analysis_projects/ApacheKafka_Bug_Analysis/data/raw/kafka_bug_reports_raw.csv"

# Load the data
df = pd.read_csv(file_path)

# Print the count of missing values for each column
print("Missing values before cleaning:")
print(df.isnull().sum())

Missing values before cleaning:
key                  0
summary              0
description        182
status               0
resolution        2155
issue_type           0
priority             0
creator             66
assignee          3310
created_at           0
updated_at           0
resolved_at       2155
labels            6765
components        2886
fix_versions      4290
comments_count       0
dtype: int64


In [3]:
df_cleaned = df.copy()

In [4]:
df_cleaned['description'].fillna('No description provided', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['description'].fillna('No description provided', inplace=True)


In [5]:
df_cleaned['resolution'].fillna('Unresolved', inplace=True)
df_cleaned['creator'].fillna('Anonymous', inplace=True)
df_cleaned['assignee'].fillna('Unassigned', inplace=True)
df_cleaned['labels'].fillna('No Labels', inplace=True)
df_cleaned['components'].fillna('Unspecified Component', inplace=True)
df_cleaned['fix_versions'].fillna('Not Yet Fixed', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['resolution'].fillna('Unresolved', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['creator'].fillna('Anonymous', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which

In [6]:
# Create a copy of your DataFrame to work with
df_cleaned = df.copy()

# Corrected code to fill missing values
df_cleaned['description'] = df_cleaned['description'].fillna('No description provided')
df_cleaned['resolution'] = df_cleaned['resolution'].fillna('Unresolved')
df_cleaned['creator'] = df_cleaned['creator'].fillna('Anonymous')
df_cleaned['assignee'] = df_cleaned['assignee'].fillna('Unassigned')
df_cleaned['labels'] = df_cleaned['labels'].fillna('No Labels')
df_cleaned['components'] = df_cleaned['components'].fillna('Unspecified Component')
df_cleaned['fix_versions'] = df_cleaned['fix_versions'].fillna('Not Yet Fixed')

# Print the count of missing values for each column again to confirm
print("Missing values after cleaning:")
print(df_cleaned.isnull().sum())

Missing values after cleaning:
key                  0
summary              0
description          0
status               0
resolution           0
issue_type           0
priority             0
creator              0
assignee             0
created_at           0
updated_at           0
resolved_at       2155
labels               0
components           0
fix_versions         0
comments_count       0
dtype: int64


In [7]:
# Convert date columns to datetime objects
df_cleaned['created_at'] = pd.to_datetime(df_cleaned['created_at'])
df_cleaned['updated_at'] = pd.to_datetime(df_cleaned['updated_at'])
df_cleaned['resolved_at'] = pd.to_datetime(df_cleaned['resolved_at'])

# Calculate the time to resolve (in days) for resolved bugs
df_cleaned['time_to_resolve_days'] = (df_cleaned['resolved_at'] - df_cleaned['created_at']).dt.days

# Display the data types and check for our new column
print(df_cleaned.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8500 entries, 0 to 8499
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   key                   8500 non-null   object             
 1   summary               8500 non-null   object             
 2   description           8500 non-null   object             
 3   status                8500 non-null   object             
 4   resolution            8500 non-null   object             
 5   issue_type            8500 non-null   object             
 6   priority              8500 non-null   object             
 7   creator               8500 non-null   object             
 8   assignee              8500 non-null   object             
 9   created_at            8500 non-null   datetime64[ns, UTC]
 10  updated_at            8500 non-null   datetime64[ns, UTC]
 11  resolved_at           6345 non-null   datetime64[ns, UTC]
 12  labels

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

# Download necessary NLTK data (only needs to be run once per session)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Create a function to clean the text
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text) # Remove non-alphabetic characters
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    lemmas = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 1]
    return " ".join(lemmas)

# Apply the preprocessing function to the summary and description columns
df_cleaned['cleaned_summary'] = df_cleaned['summary'].apply(preprocess_text)
df_cleaned['cleaned_description'] = df_cleaned['description'].apply(preprocess_text)

# Save your final cleaned DataFrame to a new CSV
output_path = "/content/drive/MyDrive/Portfolio/Data_analysis_projects/ApacheKafka_Bug_Analysis/data/processed/kafka_bug_reports_cleaned.csv"
df_cleaned.to_csv(output_path, index=False)
print(f"Cleaned data saved to {output_path}")

# Display the cleaned text for the first row to confirm
print("\nExample of cleaned text:")
print("Original Summary:", df_cleaned.loc[0, 'summary'])
print("Cleaned Summary:", df_cleaned.loc[0, 'cleaned_summary'])
print("Original Description:", df_cleaned.loc[0, 'description'])
print("Cleaned Description:", df_cleaned.loc[0, 'cleaned_description'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

# Download necessary NLTK data (this is the corrected section)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab') # This line has been added to fix the LookupError

# Create a function to clean the text
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text) # Remove non-alphabetic characters
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    lemmas = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 1]
    return " ".join(lemmas)

# Apply the preprocessing function to the summary and description columns
df_cleaned['cleaned_summary'] = df_cleaned['summary'].apply(preprocess_text)
df_cleaned['cleaned_description'] = df_cleaned['description'].apply(preprocess_text)

# Save your final cleaned DataFrame to a new CSV
output_path = "/content/drive/MyDrive/Portfolio/Data_analysis_projects/ApacheKafka_Bug_Analysis/data/processed/kafka_bug_reports_cleaned.csv"
df_cleaned.to_csv(output_path, index=False)
print(f"Cleaned data saved to {output_path}")

# Display the cleaned text for the first row to confirm
print("\nExample of cleaned text:")
print("Original Summary:", df_cleaned.loc[0, 'summary'])
print("Cleaned Summary:", df_cleaned.loc[0, 'cleaned_summary'])
print("Original Description:", df_cleaned.loc[0, 'description'])
print("Cleaned Description:", df_cleaned.loc[0, 'cleaned_description'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Cleaned data saved to /content/drive/MyDrive/Portfolio/Data_analysis_projects/ApacheKafka_Bug_Analysis/data/processed/kafka_bug_reports_cleaned.csv

Example of cleaned text:
Original Summary: LastKnownLeader should only be elected if it is unfenced
Cleaned Summary: lastknownleader elected unfenced
Original Description: In PartitionChangeBuilder, there is a bug that even if the laterKnownLeader is fenced, it can still be a leader
Cleaned Description: partitionchangebuilder bug even laterknownleader fenced still leader
