In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:

data_path = '/content/drive/MyDrive/stacksample/'


In [3]:
import pandas as pd
questions_df = pd.read_csv(data_path + 'Questions.csv', encoding='ISO-8859-1')
tags_df = pd.read_csv(data_path + 'Tags.csv', encoding='ISO-8859-1')
print("Questions:")
print(questions_df.head())
print("\nTags:")
print(tags_df.head())

Questions:
    Id  OwnerUserId          CreationDate            ClosedDate  Score  \
0   80         26.0  2008-08-01T13:57:07Z                   NaN     26   
1   90         58.0  2008-08-01T14:41:24Z  2012-12-26T03:45:49Z    144   
2  120         83.0  2008-08-01T15:50:08Z                   NaN     21   
3  180    2089740.0  2008-08-01T18:42:19Z                   NaN     53   
4  260         91.0  2008-08-01T23:22:08Z                   NaN     49   

                                               Title  \
0  SQLStatement.execute() - multiple queries in o...   
1  Good branching and merging tutorials for Torto...   
2                                  ASP.NET Site Maps   
3                 Function for creating color wheels   
4  Adding scripting functionality to .NET applica...   

                                                Body  
0  <p>I've written a database generation script i...  
1  <p>Are there any really good tutorials explain...  
2  <p>Has anyone got experience creating <

In [4]:

top_tags = tags_df['Tag'].value_counts().head(10).index.tolist()
filtered_tags_df = tags_df[tags_df['Tag'].isin(top_tags)]
print(f"Top 10 tags: {top_tags}")

Top 10 tags: ['javascript', 'java', 'c#', 'php', 'android', 'jquery', 'python', 'html', 'c++', 'ios']


In [5]:
# Merge the questions and filtered tags based on the question ID
merged_df = pd.merge(questions_df, filtered_tags_df, left_on='Id', right_on='Id')
print(merged_df.head())

     Id  OwnerUserId          CreationDate ClosedDate  Score  \
0   260         91.0  2008-08-01T23:22:08Z        NaN     49   
1   330         63.0  2008-08-02T02:51:36Z        NaN     29   
2   650        143.0  2008-08-03T11:12:52Z        NaN     79   
3   930        245.0  2008-08-04T00:47:25Z        NaN     28   
4  1010         67.0  2008-08-04T03:59:42Z        NaN     14   

                                               Title  \
0  Adding scripting functionality to .NET applica...   
1          Should I use nested classes in this case?   
2                Automatically update version number   
3  How do I connect to a database and loop over a...   
4  How to get the value of built, encoded ViewState?   

                                                Body  Tag  
0  <p>I have a little game written in C#. It uses...   c#  
1  <p>I am working on a collection of classes use...  c++  
2  <p>I would like the version property of my app...   c#  
3  <p>What's the simplest way to conne

In [6]:
import nltk

# Downloading the necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  # Optional WordNet dependencies
nltk.download('averaged_perceptron_tagger')  # For POS tagging, if needed


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re


# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Define the clean_text function
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Tokenize text
    words = word_tokenize(text)
    # Remove stopwords and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    # Join words back into a single string
    return ' '.join(words)



In [12]:
!rm -rf /root/nltk_data

In [13]:
import nltk

# Downloading the necessary NLTK packages again
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [15]:
!rm -rf /root/nltk_data


In [16]:
!wget https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip
!unzip stopwords.zip -d /root/nltk_data/corpora/


--2024-11-18 07:53:10--  https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 34276 (33K) [application/zip]
Saving to: ‘stopwords.zip’


2024-11-18 07:53:10 (18.9 MB/s) - ‘stopwords.zip’ saved [34276/34276]

Archive:  stopwords.zip
checkdir:  cannot create extraction directory: /root/nltk_data/corpora
           No such file or directory


In [17]:
import os
print(os.path.exists('/root/nltk_data/corpora/stopwords'))
print(os.listdir('/root/nltk_data/corpora/stopwords'))

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
print("Stopwords loaded successfully!")


False


FileNotFoundError: [Errno 2] No such file or directory: '/root/nltk_data/corpora/stopwords'

In [18]:
import nltk
import os

# Create a custom directory
nltk_data_dir = '/content/nltk_data'
os.makedirs(nltk_data_dir, exist_ok=True)

# Download stopwords into the new directory
nltk.download('stopwords', download_dir=nltk_data_dir)
nltk.download('punkt', download_dir=nltk_data_dir)
nltk.download('wordnet', download_dir=nltk_data_dir)
nltk.download('omw-1.4', download_dir=nltk_data_dir)
nltk.download('averaged_perceptron_tagger', download_dir=nltk_data_dir)

# Verify if stopwords are downloaded
print(os.listdir(f"{nltk_data_dir}/corpora/stopwords"))


[nltk_data] Downloading package stopwords to /content/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /content/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /content/nltk_data...
[nltk_data] Downloading package omw-1.4 to /content/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /content/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


['hebrew', 'azerbaijani', 'english', 'arabic', 'tajik', 'slovene', 'russian', 'portuguese', 'romanian', 'nepali', 'basque', 'german', 'kazakh', 'norwegian', 'danish', 'indonesian', 'chinese', 'catalan', 'italian', 'spanish', 'dutch', 'french', 'finnish', 'swedish', 'bengali', 'turkish', 'hungarian', 'README', 'greek', 'hinglish']


In [20]:
!pip install stopwordsiso

import stopwordsiso as stopwords
stop_words = stopwords.stopwords("en")
print("Alternative stopwords loaded successfully!")



Collecting stopwordsiso
  Downloading stopwordsiso-0.6.1-py3-none-any.whl.metadata (2.5 kB)
Downloading stopwordsiso-0.6.1-py3-none-any.whl (73 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.5/73.5 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: stopwordsiso
Successfully installed stopwordsiso-0.6.1
Alternative stopwords loaded successfully!


In [21]:
import re
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Function to clean the text
def clean_text(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = text.split()
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]




In [24]:
print(merged_df.columns)

Index(['Id', 'OwnerUserId', 'CreationDate', 'ClosedDate', 'Score', 'Title',
       'Body', 'Tag'],
      dtype='object')


In [25]:

merged_df['Cleaned_Body'] = merged_df['Body'].apply(clean_text)

print(merged_df[['Body', 'Cleaned_Body']].head())


                                                Body Cleaned_Body
0  <p>I have a little game written in C#. It uses...         None
1  <p>I am working on a collection of classes use...         None
2  <p>I would like the version property of my app...         None
3  <p>What's the simplest way to connect and quer...         None
4  <p>I need to grab the base64-encoded represent...         None


In [27]:
from sklearn.model_selection import train_test_split

X = merged_df['Cleaned_Body']
y = merged_df['Tag'].apply(lambda x: [tag for tag in x.split(',')])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training data size: {len(X_train)}")
print(f"Validation data size: {len(X_val)}")


Training data size: 661391
Validation data size: 165348


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # Limit to 5000 features for faster computation

# Fit and transform the training data
X_train_vect = vectorizer.fit_transform(X_train)

# Transform the validation data
X_val_vect = vectorizer.transform(X_val)

print(f"Feature matrix shape for training data: {X_train_vect.shape}")
print(f"Feature matrix shape for validation data: {X_val_vect.shape}")


AttributeError: 'NoneType' object has no attribute 'lower'

In [30]:
X_train = X_train[X_train.str.strip().str.len() > 0]

In [31]:
print(X_train.head())

Series([], Name: Cleaned_Body, dtype: object)


In [32]:
print(merged_df['Cleaned_Body'].isnull().sum())

826739


In [33]:
merged_df['Cleaned_Body'] = merged_df['Body'].apply(clean_text)

In [34]:
print(merged_df['Cleaned_Body'].isnull().sum())

826739


In [36]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]


    return ' '.join(words)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [37]:
print(merged_df['Cleaned_Body'].isnull().sum())
print((merged_df['Cleaned_Body'] == '').sum())

826739
0


In [38]:
print(merged_df['Body'].head(20))

0     <p>I have a little game written in C#. It uses...
1     <p>I am working on a collection of classes use...
2     <p>I would like the version property of my app...
3     <p>What's the simplest way to connect and quer...
4     <p>I need to grab the base64-encoded represent...
5     <p>I'm looking for a way to delete a file whic...
6     <p>Getting back into a bit more .NET after a f...
7     <p>I currently use a DataTable to get results ...
8     <p>How do you disable <code>autocomplete</code...
9     <p>Let's say that we have an ARGB color:</p>\n...
10    <p>I am getting the following error:</p>\n\n<b...
11    <p>I'm having trouble figuring out how to get ...
12    <p>I'm wondering how to make a release build t...
13    <p>What code analysis tools do you use on your...
14    <p>Is there available any tool for PHP which c...
15    <p>How is it possible to make prototype method...
16    <p>Example: I have two shared objects (same sh...
17    <p>I am new to C# and am doing some work i

In [39]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):

    if not isinstance(text, str):
        return ''
    text = BeautifulSoup(text, "html.parser").get_text()
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words) if words else ''


In [40]:
!pip install beautifulsoup4



In [43]:
print(merged_df['Cleaned_Body'].isnull().sum())
print((merged_df['Cleaned_Body'] == '').sum())

0
5


In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(merged_df['Cleaned_Body'])

print(f"Shape of the vectorized data: {X.shape}")

Shape of the vectorized data: (826739, 5000)


In [45]:
from sklearn.preprocessing import MultiLabelBinarizer
y = merged_df['Tag'].apply(lambda x: x.split(','))
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y)

print(f"Shape of the label data: {y.shape}")


Shape of the label data: (826739, 10)


In [46]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set shape: {X_train.shape}, Validation set shape: {X_val.shape}")


Training set shape: (661391, 5000), Validation set shape: (165348, 5000)


In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score


classifier = OneVsRestClassifier(LogisticRegression(max_iter=1000))

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.5290478264024966


In [49]:
!git config --global user.name "TanveerBakshi"
!git config --global user.email "tbakshi_be20@thapar.edu"

fatal: not a git repository (or any of the parent directories): .git


In [54]:
!git init

[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/.git/


In [56]:
!git config --global user.name "TanveerBakshi"
!git config --global user.email "tbakshi_be20@thapar.edu"

In [57]:
!git add .

In [58]:
!git commit -m "file"

[master (root-commit) fa09aea] file
 961 files changed, 6284477 insertions(+)
 create mode 100644 .config/.last_opt_in_prompt.yaml
 create mode 100644 .config/.last_survey_prompt.yaml
 create mode 100644 .config/.last_update_check.json
 create mode 100644 .config/active_config
 create mode 100644 .config/config_sentinel
 create mode 100644 .config/configurations/config_default
 create mode 100644 .config/default_configs.db
 create mode 100644 .config/gce
 create mode 100644 .config/hidden_gcloud_config_universe_descriptor_data_cache_configs.db
 create mode 100644 .config/logs/2024.11.14/14.24.26.637550.log
 create mode 100644 .config/logs/2024.11.14/14.24.48.387435.log
 create mode 100644 .config/logs/2024.11.14/14.25.01.300026.log
 create mode 100644 .config/logs/2024.11.14/14.25.02.296080.log
 create mode 100644 .config/logs/2024.11.14/14.25.14.950547.log
 create mode 100644 .config/logs/2024.11.14/14.25.15.677507.log
 create mode 100644 drive/MyDrive/Colab Notebooks/Untitled0.ipynb


In [2]:
!git remote -v


fatal: not a git repository (or any of the parent directories): .git
