In [1]:
# import required libraries

import pandas as pd # dataframes
import numpy as np #mathematical operations

# to visualise

import plotly.express as px # plotly
from wordcloud import WordCloud # wordclouds
import matplotlib.pyplot as plt #matplot
import seaborn as sns #seaborn

# preprocessing

import re #Regular expressions
import nltk #NLP library
import spacy
import string
from bs4 import BeautifulSoup #HTML tags
from nltk.corpus import stopwords #english stopwords
#lematising
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Vectorisation

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# Treating class imbalanced data

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

# model building

from sklearn.model_selection import train_test_split

# algorithms

from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

# checking up with models

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# confusion matrix
from sklearn import metrics
from sklearn.metrics import confusion_matrix

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# landing google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# load your dataframe
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Research/Data Frames/document-dataset.csv")

In [None]:
#observing data frame
df.head()

Unnamed: 0,fileclass,content
0,ChildProtection,TRENDS IN ONLINE CHILD SEXUAL ABUSE MATERIAL1\...
1,ChildProtection,COVID-19: DIGITAL AND REMOTE \nAPPROACHES IN E...
2,ChildProtection,\nJournal Pre-proof\nDigital media use and su...
3,ChildProtection,OrigiNAL Ar TiCLE\nsexual Exploitation and Abu...
4,ChildProtection,COVID-19 and its implications \nfor protecting...


In [None]:
#data types
df.dtypes

fileclass    object
content      object
dtype: object

In [None]:
# change both types into string
df['content'] = df['content'].astype('str')
df['fileclass'] = df['fileclass'].astype('str')

In [None]:
# size of the data frame
df.size

1702

In [None]:
# spread of the data frame
print("No. of text files: ",df.shape[0])

No. of text files:  851


In [None]:
df = df.drop_duplicates(subset=['content'],keep="first")

In [None]:
# Count the number of items per group
group_counts = df.groupby("fileclass").count()
group_counts=group_counts.reset_index()

# Create the bar chart
fig = px.bar(group_counts, x='fileclass', y='content',color="fileclass")

# Show the chart
fig.show()

In [None]:
# Filter the DataFrame for intervention areas
intervention_counts = df['fileclass'].value_counts().reset_index()

# Rename the columns in the count table
intervention_counts.columns = ['Intervention Area', 'Count']

# Sort the count table by the intervention areas
intervention_counts = intervention_counts.sort_values('Intervention Area')

# Display the count table
print(intervention_counts)


            Intervention Area  Count
8             ChildProtection     26
5               Cybersecurity     40
11                DataPrivacy     13
2      DataSystemsDevelopment     84
0              DigitalFinance    149
1            DigitalInclusion     89
3   DigitalInformatioServices     59
6       DigitalInfrastructure     34
10            DigitalLiteracy     15
7             DigitalServices     31
4                 Egovernment     50
9                  Upskilling     25


In [None]:
df['content'] = df['content'].astype('str')
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

df["content"] = df["content"].apply(lambda text: remove_urls(text))

df.content.iloc[70]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['content'] = df['content'].astype('str')




In [None]:
def remove_html(text):
    return BeautifulSoup(text, "lxml").text

df["content"] = df["content"].apply(lambda text: remove_html(text))

df.content.iloc[70]



In [None]:
def remove_emails(text):
    email_pattern = re.compile(r'\S+@\S+')
    return email_pattern.sub(r'', text)

df["content"] = df["content"].apply(lambda text: remove_emails(text))
df.content.iloc[70]



In [None]:
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

df["content"] = df["content"].apply(lambda text: remove_punctuation(text))
df.content.iloc[70]



In [None]:
def remove_numbers(text):
    number_pattern = re.compile(r'\d+')
    return number_pattern.sub(r'', text)

df["content"] = df["content"].apply(lambda text: remove_numbers(text))
df.content.iloc[70]



In [None]:
def remove_non_alpha(text):
    non_alpha_pattern = re.compile(r'[^a-zA-Z\s]')
    return non_alpha_pattern.sub(r'', text)

df["content"] = df["content"].apply(lambda text: remove_non_alpha(text))

In [None]:
def remove_extra_whitespaces(text):
    whitespace_pattern = re.compile(r'\s+')
    return whitespace_pattern.sub(' ', text)

df["content"] = df["content"].apply(lambda text: remove_extra_whitespaces(text))
df.content.iloc[70]



In [None]:
def remove_single_letter_words(text):
    return " ".join([word for word in str(text).split() if len(word) > 1])

df["content"] = df["content"].apply(lambda text: remove_single_letter_words(text))
df.content.iloc[70]



In [3]:
!pip install nlpaug


Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [12]:
input = "This article reflects on information and communication technology from the perspective of the Anthropology of Technique"

In [5]:
import nlpaug.augmenter.word as naw

# Define the augmentation function
def augment_text(text, n=20):
    augmented_texts = []
    augmenter = naw.SynonymAug(aug_src='wordnet')

    while len(augmented_texts) < n:
        augmented_text = augmenter.augment(text)
        if augmented_text != text and augmented_text not in augmented_texts:
            augmented_texts.append(augmented_text)

    return augmented_texts

In [13]:
augment_text(input, 1)

[['This article shine on selective information and communication technology from the perspective of the Anthropology of Proficiency']]

In [2]:
import pandas as pd
import nlpaug.augmenter.word as naw

# Filter the DataFrame for the Child Protection intervention area
dig_df = df[df['fileclass'] == 'DigitalLiteracy']

# Get the unique content from the Child Protection intervention area
contents = dig_df['content']

# Define the augmentation function
def augment_text(text, n=20):
    augmented_texts = []
    augmenter = naw.SynonymAug(aug_src='wordnet')

    while len(augmented_texts) < n:
        augmented_text = augmenter.augment(text)
        if augmented_text != text and augmented_text not in augmented_texts:
            augmented_texts.append(augmented_text)

    return augmented_texts

# Generate augmented contents for all rows
augmented_contents = []
for content in contents:
    augmented_texts = augment_text(content, n=1)  # You can adjust the number of augmentations per row
    #augmented_texts.append(content)  # Add the original content
    augmented_texts.extend(augmented_texts)  # Add augmented content
    augmented_contents.extend(augmented_texts)

# Create a DataFrame with the augmented contents
augmented_df = pd.DataFrame({'content': augmented_contents})

# Sample as many records as there are in the augmented DataFrame, but at most 100
sample_size = min(100, len(augmented_df))
augmented_df = augmented_df.sample(n=sample_size, random_state=42)

# Print the augmented DataFrame
print(augmented_df)

ModuleNotFoundError: ignored

In [None]:
# Save the augmented DataFrame as a CSV file
augmented_df.to_csv('DigitalLiteracy_aug.csv', index=False)

# Download the CSV file
from google.colab import files
files.download('DigitalLiteracy_aug.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>