<a href="https://colab.research.google.com/github/RoseMariaGeorge-git/GAPE_Agriculture/blob/main/GAPE_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

# All files produced by this notebook are written relative to this folder.
base_dir = "/content/GAPE_Agriculture"

# Create the folder if it is missing, then switch into it so later cells can
# refer to files by bare name.
os.makedirs(name=base_dir, exist_ok=True)
os.chdir(base_dir)

current_dir = os.getcwd()
print(f"Working directory: {current_dir}")

Working directory: /content/GAPE_Agriculture


In [None]:
# One sentence per line; every line, including the last, ends with a newline.
corpus_sentences = (
    "A farmer plans the season by selecting seed varieties based on soil moisture rainfall forecast and pest resistance",
    "Field preparation includes ploughing levelling and applying organic manure to improve soil health",
    "During growth irrigation scheduling and nutrient management are monitored using drip lines and fertigation",
    "Harvest timing depends on maturity index market price and storage availability",
)
corpus_text = "".join(sentence + "\n" for sentence in corpus_sentences)

with open("corpus.txt", "w") as corpus_file:
    corpus_file.write(corpus_text)

print("corpus.txt created")


corpus.txt created


In [None]:
import re
import csv
from collections import Counter

# Number of neighbouring tokens taken on each side of a target word.
WINDOW_SIZE = 4

# Load the corpus and fold it to lower case before tokenising.
with open("corpus.txt") as corpus_file:
    text = corpus_file.read().lower()

# Each run of word characters becomes one token.
tokens = re.findall(r'\b\w+\b', text)

# Vocabulary: ids follow the order in which words first appear in the corpus.
word_freq = Counter(tokens)
vocab = dict(zip(word_freq, range(len(word_freq))))

# One "<word> <id> <count>" line per vocabulary entry.
vocab_lines = [
    f"{word} {word_id} {word_freq[word]}\n" for word, word_id in vocab.items()
]
with open("vocab.txt", "w") as vocab_file:
    vocab_file.writelines(vocab_lines)

cbow_data, skipgram_data = [], []

for position, word in enumerate(tokens):
    # Up to WINDOW_SIZE tokens before and after the target, clipped at the
    # corpus boundaries; the target itself is excluded.
    first = max(position - WINDOW_SIZE, 0)
    neighbours = tokens[first:position] + tokens[position + 1:position + WINDOW_SIZE + 1]
    context = [vocab[neighbour] for neighbour in neighbours]
    if not context:
        continue
    target_id = vocab[word]
    # CBOW sample: (list of context ids, target id).
    cbow_data.append((context, target_id))
    # Skip-gram samples: one (target id, context id) pair per context word.
    skipgram_data.extend((target_id, context_id) for context_id in context)

# CBOW rows: space-separated context ids in one column, target id in the other.
with open("cbow_dataset.csv", "w", newline="") as cbow_file:
    cbow_writer = csv.writer(cbow_file)
    cbow_writer.writerow(["context_ids", "target_id"])
    cbow_writer.writerows(
        [" ".join(str(context_id) for context_id in context_ids), target_id]
        for context_ids, target_id in cbow_data
    )

# Skip-gram rows: one (input id, output id) pair per line.
with open("skipgram_dataset.csv", "w", newline="") as skipgram_file:
    skipgram_writer = csv.writer(skipgram_file)
    skipgram_writer.writerow(["input_id", "output_id"])
    skipgram_writer.writerows(
        [input_id, output_id] for input_id, output_id in skipgram_data
    )

print("Vocabulary & datasets generated")


Vocabulary & datasets generated


In [None]:
# STEP 4: GENERATE CBOW & SKIP-GRAM DATASETS
# -------------------------------
# Rebuilds both pair lists from `tokens`, `vocab` and WINDOW_SIZE, then
# overwrites the two dataset CSV files.
cbow_data = []
skipgram_data = []
n_tokens = len(tokens)

for pos in range(n_tokens):
    # Window bounds clipped to the valid token positions.
    lo = max(0, pos - WINDOW_SIZE)
    hi = min(n_tokens, pos + WINDOW_SIZE + 1)
    context = [vocab[tokens[k]] for k in range(lo, hi) if k != pos]
    if context:
        target_id = vocab[tokens[pos]]
        # CBOW
        cbow_data.append((context, target_id))
        # Skip-gram
        skipgram_data += [(target_id, ctx_id) for ctx_id in context]

# Save CBOW dataset
with open("cbow_dataset.csv", "w", newline="") as out_file:
    rows = csv.writer(out_file)
    rows.writerow(["context_ids", "target_id"])
    for context_ids, target_id in cbow_data:
        rows.writerow([" ".join(str(cid) for cid in context_ids), target_id])

# Save Skip-gram dataset
with open("skipgram_dataset.csv", "w", newline="") as out_file:
    rows = csv.writer(out_file)
    rows.writerow(["input_id", "output_id"])
    for input_id, output_id in skipgram_data:
        rows.writerow([input_id, output_id])

print("CBOW & Skip-gram datasets created")

CBOW & Skip-gram datasets created


In [None]:
import numpy as np
import csv
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam

# Hyper-parameters for this Skip-gram run.
EMBEDDING_DIM = 10     # number of units in the hidden (embedding) layer
EPOCHS = 100
LEARNING_RATE = 0.05   # passed to the Adam optimizer

# vocab.txt holds one line per word, so its line count is the vocabulary size.
# The context manager closes the file as soon as counting finishes; a bare
# open() inside the generator would leave the handle to the garbage collector.
with open("vocab.txt") as vocab_file:
    vocab_size = sum(1 for _ in vocab_file)

def one_hot(i, size):
    """Return a float vector of length `size` that is 1 at index `i` and 0 elsewhere."""
    encoded = np.zeros(size, dtype=float)
    encoded[i] = 1.0
    return encoded

# Read every (input_id, output_id) pair from the Skip-gram dataset.
with open("skipgram_dataset.csv") as pairs_file:
    pairs = [
        (int(record["input_id"]), int(record["output_id"]))
        for record in csv.DictReader(pairs_file)
    ]

# One-hot encode both sides; each array has one row per training pair.
X = np.array([one_hot(input_id, vocab_size) for input_id, _ in pairs])
y = np.array([one_hot(output_id, vocab_size) for _, output_id in pairs])

# Two dense layers: EMBEDDING_DIM units with no activation, followed by a
# softmax over the vocabulary.
embedding_layer = Dense(EMBEDDING_DIM)
output_layer = Dense(vocab_size, activation="softmax")
model_sg = Sequential([Input(shape=(vocab_size,)), embedding_layer, output_layer])

model_sg.compile(
    loss="categorical_crossentropy",
    optimizer=Adam(learning_rate=LEARNING_RATE),
)

history = model_sg.fit(X, y, epochs=EPOCHS, verbose=0)

# One loss value per line, in epoch order.
with open("loss_skipgram.txt", "w") as loss_file:
    loss_file.writelines(f"{epoch_loss}\n" for epoch_loss in history.history["loss"])

# The first weight array of the first layer is saved as the embedding matrix.
embeddings = model_sg.layers[0].get_weights()[0]
np.savetxt("embeddings_skipgram.csv", embeddings, delimiter=",")

print("Skip-gram training completed successfully")

Skip-gram training completed successfully


In [None]:
# STEP 5: CBOW & SKIP-GRAM TRAINING
# -------------------------------
import csv

import numpy as np
import pandas as pd
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

# PARAMETERS
EMBEDDING_DIM = 10      # number of units in the embedding (hidden) layer
EPOCHS = 50             # training epochs for each model
BATCH_SIZE = 1          # batch size passed to fit()
LEARNING_RATE = 0.05    # learning rate passed to Adam

# One input/output unit per vocabulary entry.
vocab_size = len(vocab.keys())

# One-hot encoding
def one_hot(i, size):
    """Build the one-hot vector for index `i` among `size` positions.

    Returns a float array of zeros with a single 1 at position `i`.
    """
    vector = np.zeros(size, dtype=np.float64)
    vector[i] = 1.0
    return vector

# -------------------------------
# SHARED MODEL / TRAINING HELPERS
# -------------------------------
def _build_embedding_model(n_words):
    """Create and compile the two-layer network used for both CBOW and Skip-gram.

    Args:
        n_words: Vocabulary size; used as both the input and the output width.

    Returns:
        A compiled Sequential model whose hidden layer is named "embedding".
    """
    model = Sequential([
        # An explicit Input layer replaces the deprecated `input_shape=`
        # argument on Dense, which makes Keras emit a warning.
        Input(shape=(n_words,)),
        Dense(EMBEDDING_DIM, activation='sigmoid', name="embedding"),
        Dense(n_words, activation='softmax'),
    ])
    model.compile(
        optimizer=Adam(learning_rate=LEARNING_RATE),
        loss="categorical_crossentropy"
    )
    return model


def _train_and_export(model, inputs, targets, loss_path, embeddings_path):
    """Fit `model`, then save its per-epoch loss and 0-1 scaled embeddings.

    Args:
        model: Compiled model returned by `_build_embedding_model`.
        inputs: Array of input vectors, one row per sample.
        targets: Array of one-hot target vectors, one row per sample.
        loss_path: File that receives one loss value per epoch.
        embeddings_path: CSV file that receives the scaled embedding table,
            indexed by vocabulary word.

    Returns:
        Tuple (history, embeddings, embeddings_01): the Keras History object,
        the raw embedding-layer kernel, and the min-max scaled copy.
    """
    history = model.fit(
        inputs, targets,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        verbose=1
    )
    np.savetxt(loss_path, history.history['loss'])

    # Look the layer up by name so the result does not depend on whether the
    # Input layer is counted in `model.layers`.
    embeddings = model.get_layer("embedding").get_weights()[0]
    # Min-max scaling uses the global min/max of the matrix, not per column.
    embeddings_01 = (embeddings - embeddings.min()) / (embeddings.max() - embeddings.min())

    pd.DataFrame(
        embeddings_01,
        index=list(vocab.keys()),
        columns=[f"dim_{d+1}" for d in range(EMBEDDING_DIM)]
    ).to_csv(embeddings_path)
    return history, embeddings, embeddings_01


# -------------------------------
# CBOW DATA
# -------------------------------
X_cbow, y_cbow = [], []

with open("cbow_dataset.csv") as f:
    reader = csv.DictReader(f)
    for row in reader:
        context_ids = list(map(int, row["context_ids"].split()))
        target_id = int(row["target_id"])
        # The CBOW input is the average of the context words' one-hot vectors.
        context_vec = np.mean([one_hot(i, vocab_size) for i in context_ids], axis=0)
        X_cbow.append(context_vec)
        y_cbow.append(one_hot(target_id, vocab_size))

X_cbow, y_cbow = np.array(X_cbow), np.array(y_cbow)

# CBOW MODEL
model_cbow = _build_embedding_model(vocab_size)
history_cbow, embeddings_cbow, embeddings_cbow_01 = _train_and_export(
    model_cbow, X_cbow, y_cbow, "loss_cbow.txt", "embeddings_cbow_0_1.csv"
)
print("CBOW training done and embeddings saved (0-1 normalized)")

# -------------------------------
# SKIP-GRAM DATA
# -------------------------------
X_sg, y_sg = [], []

with open("skipgram_dataset.csv") as f:
    reader = csv.DictReader(f)
    for row in reader:
        target_id = int(row["input_id"])
        output_id = int(row["output_id"])
        X_sg.append(one_hot(target_id, vocab_size))
        y_sg.append(one_hot(output_id, vocab_size))

X_sg, y_sg = np.array(X_sg), np.array(y_sg)

# SKIP-GRAM MODEL
model_sg = _build_embedding_model(vocab_size)
history_sg, embeddings_sg, embeddings_sg_01 = _train_and_export(
    model_sg, X_sg, y_sg, "loss_skipgram.txt", "embeddings_skipgram_0_1.csv"
)
print("Skip-gram training done and embeddings saved (0-1 normalized)")


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 4.7512
Epoch 2/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 4.0128
Epoch 3/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 3.8863
Epoch 4/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.7591
Epoch 5/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.8969
Epoch 6/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.8487
Epoch 7/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.9728
Epoch 8/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.8768
Epoch 9/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.8086
Epoch 10/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.4506
Epoch 11/50
[1m56/

In [None]:
# DISPLAY ALL EMBEDDINGS
# -------------------------------
# Show every row, use four decimal places, and allow wide tables.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.precision', 4),
    ('display.width', 200),
):
    pd.set_option(option_name, option_value)

# Row and column labels shared by both tables.
words = list(vocab.keys())
dim_names = [f"dim_{i+1}" for i in range(EMBEDDING_DIM)]

for heading, matrix in (
    ("\nCBOW Embeddings (0-1 normalized):", embeddings_cbow_01),
    ("\nSkip-gram Embeddings (0-1 normalized):", embeddings_sg_01),
):
    print(heading)
    display(pd.DataFrame(matrix, index=words, columns=dim_names))




CBOW Embeddings (0-1 normalized):


Unnamed: 0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,dim_10
a,0.4612,0.5012,0.3093,0.6998,0.4387,0.4768,0.5735,0.8162,0.4293,0.4195
farmer,0.6665,0.6385,0.501,0.7125,0.6584,0.5243,0.3575,0.5309,0.5398,0.4457
plans,0.5412,0.5921,0.4481,0.741,0.8398,0.4613,0.5851,0.6858,0.6295,0.4767
the,0.7246,0.326,0.3376,0.7124,0.6834,0.4708,0.7447,0.465,0.524,0.4656
season,0.5288,0.7274,0.386,0.7043,0.7217,0.6067,0.7717,0.611,0.4849,0.4814
by,0.3503,0.4817,0.3541,0.6962,0.4448,0.4439,0.7318,0.7979,0.629,0.4307
selecting,0.6878,0.7269,0.6388,0.8116,0.2737,0.5587,0.3593,0.5252,0.7552,0.4399
seed,0.5144,0.682,0.7749,0.7874,0.7687,0.4889,0.485,0.8196,0.93,0.4683
varieties,0.7691,0.355,0.8288,0.6658,0.6573,0.4933,0.6045,0.4439,1.0,0.5177
based,0.7143,0.8108,0.7101,0.5979,0.7132,0.8255,0.7253,0.4621,0.9982,0.5626



Skip-gram Embeddings (0-1 normalized):


Unnamed: 0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,dim_10
a,0.6695,0.4029,0.207,0.0259,0.2016,0.1235,1.0,0.1024,0.6074,0.1975
farmer,0.6849,0.678,0.2473,0.0963,0.3121,0.1734,0.6938,0.1379,0.3041,0.3401
plans,0.6759,0.1768,0.1419,0.1885,0.3049,0.1344,0.7037,0.0799,0.2783,0.6592
the,0.1991,0.2182,0.1625,0.1661,0.7025,0.1112,0.6614,0.1714,0.3446,0.2994
season,0.2164,0.2141,0.2174,0.5047,0.3971,0.0285,0.6868,0.2239,0.3491,0.2931
by,0.2379,0.2111,0.1787,0.1083,0.2114,0.0025,0.6353,0.2451,0.6238,0.4095
selecting,0.4695,0.673,0.3984,0.2096,0.2686,0.1605,0.5956,0.4074,0.6186,0.6078
seed,0.6883,0.3732,0.2733,0.1915,0.2464,0.2474,0.5748,0.2745,0.6142,0.6843
varieties,0.2793,0.173,0.1995,0.1431,0.4277,0.1193,0.4939,0.264,0.5815,0.7409
based,0.3501,0.2265,0.314,0.5436,0.7008,0.1098,0.4211,0.2727,0.6294,0.6957


In [None]:
# Markdown description of the project, written to README.md below.
# The final "Output" bullet points at the generated CSV files instead of
# ending in an empty placeholder.
readme_text = """# GAPE Programming Assignment – Word Embeddings

## Domain
Agriculture (Crop Farming)

## Framework
Keras (TensorFlow backend)

---

## Corpus
- Single paragraph about agriculture practices:
  - Crop planning and seed selection
  - Soil moisture, rainfall, and pest resistance
  - Field preparation and irrigation
  - Harvest timing, market price, and storage
- Text is preprocessed (lowercase, punctuation removed, tokenized).

---

## Vocabulary
- Unique words are assigned IDs and frequency.
- Saved in `vocab.txt`.

---

## Models
1. **CBOW (Continuous Bag of Words)**
   - Input: Context words (window size 4)
   - Output: Target word
2. **Skip-gram**
   - Input: Target word
   - Output: Context words
- Both models use **one-hot encoding**.
- No pretrained embeddings used.

---

## Parameters
- Embedding dimension = 10
- Context window = 4
- Epochs = 50
- Learning rate = 0.05

---

## Files
- `corpus.txt` – Original text
- `vocab.txt` – Word IDs and frequency
- `cbow_dataset.csv` – CBOW training data
- `skipgram_dataset.csv` – Skip-gram training data
- `embeddings_cbow_0_1.csv` – CBOW embeddings (0-1)
- `embeddings_skipgram_0_1.csv` – Skip-gram embeddings (0-1)
- `loss_cbow.txt` – CBOW loss per epoch
- `loss_skipgram.txt` – Skip-gram loss per epoch
- `README.md` – Project description

---

## Usage
1. Run the scripts in Google Colab or Python environment with TensorFlow/Keras.
2. The scripts will preprocess the text, generate datasets, train CBOW and Skip-gram models, and save embeddings scaled to 0-1.
3. Embeddings can be used for similarity analysis or nearest-neighbor queries.

---

## Output
- 10-dimensional embeddings for each word.
- Loss per epoch saved in text files.
- Example embeddings (range 0-1): see `embeddings_cbow_0_1.csv` and `embeddings_skipgram_0_1.csv`.
"""

# The text contains non-ASCII dashes, so the encoding is fixed to UTF-8 rather
# than left to the platform default.
with open("README.md", "w", encoding="utf-8") as f:
    f.write(readme_text)

print("README.md created ")


README.md created 


In [None]:
# Recursively add everything in the current directory to GAPE_Agriculture.zip.
!zip -r GAPE_Agriculture.zip .


updating: corpus.txt (deflated 40%)
updating: embeddings_skipgram.csv (deflated 54%)
updating: skipgram_dataset.csv (deflated 70%)
updating: README.md (deflated 51%)
updating: embeddings_cbow.csv (deflated 56%)
updating: loss_skipgram.txt (deflated 57%)
updating: similarity_results.txt (deflated 50%)
updating: loss_cbow.txt (deflated 54%)
updating: cbow_dataset.csv (deflated 70%)
updating: vocab.txt (deflated 44%)
updating: embeddings_skipgram_0_1.csv (deflated 52%)
updating: embeddings_cbow_0_1.csv (deflated 52%)


CHECKING THE FILE CONTENTS


In [None]:
# List the files in the current working directory.
!ls


cbow_dataset.csv	     embeddings_skipgram.csv  similarity_results.txt
corpus.txt		     GAPE_Agriculture.zip     skipgram_dataset.csv
embeddings_cbow_0_1.csv      loss_cbow.txt	      vocab.txt
embeddings_cbow.csv	     loss_skipgram.txt
embeddings_skipgram_0_1.csv  README.md


In [None]:
# Print the corpus file verbatim.
with open("corpus.txt") as corpus_file:
    corpus_contents = corpus_file.read()
print(corpus_contents)


A farmer plans the season by selecting seed varieties based on soil moisture rainfall forecast and pest resistance
Field preparation includes ploughing levelling and applying organic manure to improve soil health
During growth irrigation scheduling and nutrient management are monitored using drip lines and fertigation
Harvest timing depends on maturity index market price and storage availability



In [None]:
# Show the first ten vocabulary entries ("<word> <id> <count>" per line).
with open("vocab.txt") as vocab_file:
    head_lines = [vocab_file.readline().strip() for _ in range(10)]
print("\n".join(head_lines))


a 0 1
farmer 1 1
plans 2 1
the 3 1
season 4 1
by 5 1
selecting 6 1
seed 7 1
varieties 8 1
based 9 1


In [None]:
import pandas as pd

# Preview the first five rows of the CBOW training data.
cbow = pd.read_csv(filepath_or_buffer="cbow_dataset.csv")
cbow.head(n=5)


Unnamed: 0,context_ids,target_id
0,1 2 3 4,0
1,0 2 3 4 5,1
2,0 1 3 4 5 6,2
3,0 1 2 4 5 6 7,3
4,0 1 2 3 5 6 7 8,4


In [None]:
# Preview the first five rows of the Skip-gram training data.
skipgram = pd.read_csv(filepath_or_buffer="skipgram_dataset.csv")
skipgram.head(5)


Unnamed: 0,input_id,output_id
0,0,1
1,0,2
2,0,3
3,0,4
4,1,0


In [None]:
import os

# No cell shown in this notebook writes similarity_results.txt, so on a fresh
# runtime the file may be absent. Report that instead of raising
# FileNotFoundError, which would stop "Run all" before the archive cells.
similarity_path = "similarity_results.txt"

if os.path.isfile(similarity_path):
    with open(similarity_path) as f:
        print(f.read())
else:
    print(f"{similarity_path} not found; generate it before running this check")



Query word: soil
timing: 0.4378
pest: 0.3984
ploughing: 0.3386
selecting: 0.3368
market: 0.3219

Query word: seed
rainfall: 0.7572
varieties: 0.6746
selecting: 0.5411
manure: 0.5152
forecast: 0.4552

Query word: irrigation
forecast: 0.6733
to: 0.5696
organic: 0.4801
based: 0.4480
resistance: 0.4237

Query word: harvest
applying: 0.7200
market: 0.4173
farmer: 0.4136
timing: 0.4079
fertigation: 0.3966

Query word: pest
levelling: 0.6608
ploughing: 0.5845
resistance: 0.4603
drip: 0.4483
soil: 0.3984



In [None]:
# List the archive's contents without extracting anything.
!unzip -l GAPE_Agriculture.zip


Archive:  GAPE_Agriculture.zip
  Length      Date    Time    Name
---------  ---------- -----   ----
      399  2026-01-11 16:42   corpus.txt
    12739  2026-01-11 16:43   embeddings_skipgram.csv
     2876  2026-01-11 17:05   skipgram_dataset.csv
     1796  2026-01-11 17:10   README.md
    12758  2026-01-11 17:01   embeddings_cbow.csv
     1250  2026-01-11 17:06   loss_skipgram.txt
      503  2026-01-11 16:45   similarity_results.txt
     1250  2026-01-11 17:05   loss_cbow.txt
     1451  2026-01-11 17:05   cbow_dataset.csv
      615  2026-01-11 16:42   vocab.txt
     5787  2026-01-11 17:06   embeddings_skipgram_0_1.csv
     5666  2026-01-11 17:05   embeddings_cbow_0_1.csv
---------                     -------
    47090                     12 files


In [None]:
# Same command as the earlier archive cell: recursively add the current
# directory's contents to GAPE_Agriculture.zip.
!zip -r GAPE_Agriculture.zip .


updating: corpus.txt (deflated 40%)
updating: embeddings_skipgram.csv (deflated 54%)
updating: skipgram_dataset.csv (deflated 70%)
updating: README.md (deflated 51%)
updating: embeddings_cbow.csv (deflated 56%)
updating: loss_skipgram.txt (deflated 57%)
updating: similarity_results.txt (deflated 50%)
updating: loss_cbow.txt (deflated 54%)
updating: cbow_dataset.csv (deflated 70%)
updating: vocab.txt (deflated 44%)
updating: embeddings_skipgram_0_1.csv (deflated 52%)
updating: embeddings_cbow_0_1.csv (deflated 52%)


In [None]:
from google.colab import files

# Name of the archive to download; change it if your ZIP file is named differently.
zip_filename = "GAPE_Agriculture.zip"
files.download(zip_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>